def get_author(authorname, numcitations=-1):
    """Search Google Scholar for an author, optionally disambiguating by citation count.

    Parameters:
        authorname: name string to search for.
        numcitations: known citation count of the target author, or -1 when no
            citation data is available (first search hit is returned).

    Returns:
        The best-matching author object, or None when the search itself fails.

    Note: scholarly surfaces a hit on the per-IP request limit as a
    ZeroDivisionError; catching it here triggers a proxy swap and a re-query.
    """
    print("Searching Scholarly...")
    best_guess = None
    try:
        search_query = scholarly.search_author(authorname)
        if numcitations == -1:
            # No citation data for this author: take the first result.
            while True:
                try:
                    best_guess = author_next_util(search_query)
                    break
                except StopIteration:
                    # End of author list reached (list has no members).
                    break
                except ZeroDivisionError:
                    # Request limit for this IP address reached.
                    print("Limit Reached... setting new proxy")
                    set_new_proxy()
                    search_query = scholarly.search_author(authorname)
                except Exception:
                    # Timeout or other transient failure: stop searching.
                    # (Was a bare `except:`, which also trapped KeyboardInterrupt.)
                    break
        else:
            # Walk all candidates and keep the one whose citation count is
            # closest to the known value.
            while True:
                try:
                    author = author_next_util(search_query)
                    if best_guess is None:
                        best_guess = author
                    else:
                        try:
                            # .citedby may be missing on either candidate.
                            if abs(numcitations - author.citedby) < abs(numcitations - best_guess.citedby):
                                best_guess = author
                        except (AttributeError, TypeError):
                            pass
                except StopIteration:
                    # End of author list reached.
                    break
                except ZeroDivisionError:
                    # Request limit reached: rotate proxy and restart the query.
                    print("Limit Reached... setting new proxy")
                    set_new_proxy()
                    search_query = scholarly.search_author(authorname)
                except Exception:
                    # Timeout: stop and return the best guess so far.
                    break
    except Exception:
        return None
    return best_guess
def fetch_citer_by_author(author):
    """Fetch the titles of all publications that cite the given author's work.

    Parameters:
        author: author name to look up. (Lookup by scholar id is not
            implemented yet; the original dead `if True/else` switch that
            hinted at it has been removed — see TODO.)

    Returns:
        List of citing-publication titles accumulated over ALL of the
        author's publications. (The original rebound `citers` on every loop
        iteration and therefore returned only the last publication's citers.)

    Raises:
        AssertionError: when the name query does not match exactly one author.
    """
    # TODO: accept a scholar id and use scholarly.search_author_id(author).
    author_gen = scholarly.search_author(author)
    # The query must be unique, otherwise we cannot tell which author is meant.
    matches = list(author_gen)
    assert (len(matches) == 1), "Author query not unique"
    author = matches[0]
    author.fill()
    # Use each publication to query for its citations.
    citers = []
    for kk, pub in enumerate(author.publications):
        print(kk)
        # Fetch the full publication entry to get citations.
        pub.fill()
        citers.extend(citation.bib['title'] for citation in pub.citedby)
    return citers
def plot_citations(author_name):
    """Plot the affiliations of authors citing author_name's first publication
    as markers on a world map.

    The repeated sleep(45) calls throttle requests to avoid Scholar's rate
    limiting.
    """
    m = Basemap(projection='mill', lon_0=180)
    m.drawmapboundary(fill_color='aqua')
    m.fillcontinents(color='coral', lake_color='aqua')
    search_query = scholarly.search_author(author_name)
    author = next(search_query).fill()
    print(author)
    for pub in [author.publications[0]]:
        print('Title: ', pub.bib['title'])
        pub = pub.fill()
        sleep(45)
        for citation in pub.citedby:
            print(citation)
            sleep(45)
            # Take one author id of the citing paper, if any. The original
            # while-loop kept popping while the list was empty and raised
            # IndexError whenever a citation had exactly one author id.
            author_ids = citation.bib.get('author_id', [])
            firstAuthorId = author_ids.pop() if author_ids else None
            if firstAuthorId is None:
                # Citation carries no author id: nothing to geolocate.
                continue
            print(firstAuthorId)
            author = scholarly.search_author_id(firstAuthorId)
            sleep(45)
            lat, lon = get_location(author.affiliation)
            x, y = m(float(lon), float(lat))
            m.plot(x, y, marker='D')
    plt.show()
def get_author(author_name):
    """Look up an author on Google Scholar and return their profile as JSON.

    Returns:
        A flask Response with the filled author profile (200) on success,
        a JSON error payload with 404 when the author is not found, or
        500 when caching the profile to disk fails.
    """
    try:
        print("author_name : ", author_name)
        # Search by author name and return a generator of Author objects.
        search_query = scholarly.search_author(author_name)
        # Populate the Author with information from their profile.
        author = next(search_query).fill()
    except Exception as exception:
        print("exception")
        # NOTE: response keys 'sucess'/'detatil' were misspelled; fixed to
        # 'success'/'detail' — check API clients that may rely on the old keys.
        return jsonify(success=False,
                       message='author was not found',
                       status=HTTPStatus.NOT_FOUND.value,
                       detail=HTTPStatus.NOT_FOUND.description,
                       ), HTTPStatus.NOT_FOUND
    try:
        # Cache author information for tests.
        file_name = author_name.replace(' ', '_').lower() + ".json"
        with open("cache/" + file_name, 'w') as file:
            file.write(author.toJSON())
    except Exception as exception:
        print("exception")
        return jsonify(success=False,
                       message=str(exception),
                       status=HTTPStatus.INTERNAL_SERVER_ERROR.value,
                       detail=HTTPStatus.INTERNAL_SERVER_ERROR.description,
                       ), HTTPStatus.INTERNAL_SERVER_ERROR
    return Response(author.toJSON(), HTTPStatus.OK, mimetype='application/json')
def quick_fetch_author(name):
    """Fetch a compact author profile (publications and coauthor names) from Scholar.

    Returns:
        dict populated via make_attribute with name, coauthors, affiliation,
        email_domain, interests, citedby, publication count and publications.
    """
    search_query = scholarly.search_author(name)
    author = scholarly.fill(next(search_query), sections=['publications', 'coauthors'])
    data = {}
    coauthors = [auth['name'] for auth in author['coauthors']]
    publications = []
    # enumerate() replaces the hand-rolled `iterator` counter of the original.
    for idx, pub in enumerate(author['publications']):
        pub_info = {}
        make_attribute(pub_info, 'title', pub, 'bib')
        make_attribute(pub_info, 'num_citations', pub, 'plain')
        make_attribute(pub_info, 'pub_year', pub, 'bib')
        pub_info['_id'] = idx
        publications.append(pub_info)
    make_attribute(data, 'name', author, 'plain')
    make_attribute(data, 'coauthors', coauthors, 'obj')
    make_attribute(data, 'affiliation', author, 'plain')
    make_attribute(data, 'email_domain', author, 'plain')
    make_attribute(data, 'interests', author, 'plain')
    make_attribute(data, 'citedby', author, 'plain')
    make_attribute(data, 'number_of_publications', len(publications), 'obj')
    make_attribute(data, 'publications', publications, 'obj')
    return data
def main(author_name):
    """ Print all publications as JSON to STDOUT """
    author = scholarly.fill(next(scholarly.search_author(author_name)))
    records = []
    for pub in author['publications']:
        # Full fill is required: the bib of an unfilled stub lacks most fields.
        bib = scholarly.fill(pub)['bib']
        records.append({
            'authors': reformat_coauthors(bib['author'].split(' and ')),
            'year': bib.get('pub_year', ''),
            'title': bib.get('title', ''),
            'journal': bib.get('journal', ''),
            'volume': bib.get('volume', ''),
            'issue': bib.get('issue', ''),
            'pages': bib.get('pages', ''),
            'citations': pub.get('num_citations', 0),
            'pub_url': pub.get('pub_url', ''),
            # 'eprint_url' omitted: appears identical to pub_url.
        })
    output_json({'publications': records})
def find_bbe_coauthors(
        name, institution="Caltech", start=2015, verbose=True):
    """Collect the set of coauthor names on an author's publications since `start`.

    Parameters:
        name: author name to search for.
        institution: appended to the query to disambiguate the author.
        start: earliest publication year to include.
        verbose: print the index of each processed publication.

    Returns:
        set of stripped coauthor name strings.
    """
    search_query = scholarly.search_author(name + ", " + institution)
    author = scholarly.fill(next(search_query))
    coauthors = set()
    for i, pub in enumerate(author['publications']):
        # Make sure this is within the date range that we care about;
        # publications without a year are skipped (KeyError).
        try:
            if int(pub['bib']['pub_year']) < start:
                continue
            elif verbose:
                print(i, end=" ", flush=True)
        except KeyError:
            continue
        # Get the full data.
        pub = scholarly.fill(pub)
        # Split on ' and ' (with spaces): splitting on bare 'and' also matched
        # inside names such as 'Alexander' and produced mangled entries.
        # Renamed loop variable — the original shadowed `author`.
        for coauthor_name in pub['bib']['author'].split(" and "):
            coauthors.add(coauthor_name.strip())
    print("")
    return coauthors
def read_author_data(author_name):
    """Fetch an author's full Scholar profile through Tor and return it as a dict.

    Parameters:
        author_name: name string to search for.

    Returns:
        dict with the author's name, citation/h/i10 indices (total and 5-year),
        picture URL, and one entry per publication (title, year, citations, link).
    """
    print("reading data for {0:s}...".format(author_name))
    # Route scholarly's traffic through a local Tor SOCKS/control port pair.
    scholarly.use_tor(9150, 9151, '')
    author = next(scholarly.search_author(author_name)).fill()
    a_data = {
        # Fixed: the original referenced an undefined `name` (NameError);
        # use the function's parameter.
        "name": author_name,
        "citedby": author.citedby,
        "citedby5y": author.citedby5y,
        "hindex": author.hindex,
        "hindex5y": author.hindex5y,
        "i10index": author.i10index,
        "i10index5y": author.i10index5y,
        "url_picture": author.url_picture,
        "pubs": [{
            # Missing fields are mapped to sentinel defaults (-1, 0, "").
            "title": pub.bib['title'],
            "year": pub.bib['year'] if "year" in pub.bib else -1,
            "citedby": pub.citedby if hasattr(pub, "citedby") else 0,
            "link": pub.id_citations if hasattr(pub, "id_citations") else ""
        } for pub in author.publications]
    }
    return a_data
def getAuthors(name, max_range=5):
    """Return summaries of up to `max_range` authors matching `name`.

    Returns:
        dict {"author_search_result": [...]} where each entry always carries
        name/affiliation/url_picture/id and, when present on the Author
        instance, the optional fields listed below.
    """
    search_query = scholarly.search_author(name)
    authors_summary = []
    for _ in range(max_range):
        result = next(search_query, None)
        if result is None:
            # Fewer results than max_range: stop early.
            break
        summary = {
            "name": result.name,
            "affiliation": result.affiliation,
            "url_picture": result.url_picture,
            "id": result.id,
        }
        # These parameters aren't present in all `Author` instances, so check
        # before adding them. hasattr() replaces the slower `in dir()` probe.
        for param in ("citedby", "i10index", "hindex", "coauthors", "interests"):
            if hasattr(result, param):
                summary[param] = getattr(result, param)
        authors_summary.append(summary)
    return {"author_search_result": authors_summary}
def test_single_author(self):
    """Search for a unique author and verify the filled profile's identity."""
    results = list(scholarly.search_author('Steven A. Cholewiak'))
    self.assertGreaterEqual(len(results), 1)
    filled = results[0].fill()
    self.assertEqual(filled.name, u'Steven A. Cholewiak, PhD')
    self.assertEqual(filled.id, u'4bahYMkAAAAJ')
def get_gs_data(author, titles):
    """Create one Paper DB record per title from an author's Scholar publications.

    Parameters:
        author: author name; also stored as co_author on each Paper.
        titles: sequence whose length bounds how many publications are saved.
            NOTE(review): publications are taken by index, assuming
            author.publications aligns with `titles` — confirm with callers.
    """
    current_author = author
    search_query = scholarly.search_author(author)
    author = next(search_query).fill()
    for i in range(len(titles)):
        # ------------ ADD ALL INFO TO DB ------------
        bib = author.publications[i].bib
        cites = bib['cites']
        title = bib['title']
        # dict.get replaces the original redundant if/else that assigned 0 twice.
        year = bib.get('year', 0)
        paper = models.Paper.objects.create(
            title=title,
            co_author=current_author,
            citations_google_scholar=int(cites),
            publication_year=int(year),
        )
        paper.save()
        print(f'{title}->{cites}')
def get_schoolar_data(author_name, cache_folder="scholarly", affiliation='UBC'):
    """Cross-reference an author's Scholar publications against CrossRef, with disk caching.

    For each publication title found on the author's Scholar profile, query
    CrossRef (via get_publication, retried up to MAX_RETRIES_ON_ERROR times),
    keep matches with similarity >= 0.7, sort them by similarity (descending),
    and cache the result as JSON under resources/<cache_folder>/.

    Parameters:
        author_name: author to look up; combined with `affiliation` in the query.
        cache_folder: subfolder of ../resources used for the JSON cache.
        affiliation: appended to the Scholar query to disambiguate the author.

    Returns:
        (final_data, from_cache): the matched-publication list and whether it
        was served from the on-disk cache.
    """
    # Cache file lives next to this module: ../resources/<cache_folder>/<author>.
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):
        try:
            # Retrieve the author's data, fill-in, and print
            search_query = scholarly.search_author(
                f'{author_name} {affiliation}')
            author = scholarly.fill(next(search_query))
            # Print the titles of the author's publications
            titles = [pub['bib']['title'] for pub in author['publications']]
            final_data = []
            for title in titles:
                logger.info("Processing " + Fore.YELLOW + title + Style.RESET_ALL)
                ret = get_publication(title)
                retries = 0
                # Retry transient CrossRef failures with a small back-off.
                while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                    retries += 1
                    msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                        ret["exception"], retries)
                    logger.info(Fore.RED + msg + Style.RESET_ALL)
                    ret = get_publication(title)
                    sleep(3)
                if ret['success']:
                    # Keep the Scholar title alongside the CrossRef match.
                    ret['original_title'] = title
                    final_data.append(ret)
                else:
                    logger.info(Fore.RED + '> Failed' + Style.RESET_ALL)
            # Keep only confident matches, best first.
            final_data = list(
                filter(lambda k: k['result']['similarity'] >= 0.7, final_data))
            final_data = sorted(final_data, key=lambda k: k['result']['similarity'], reverse=True)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except StopIteration:
            # No Scholar profile found: cache the empty result so we don't re-query.
            logger.info(Fore.RED + 'no more schoolar data available' + Style.RESET_ALL)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except Exception as ex:
            # Unexpected failure: log it and fall through with whatever was collected.
            logger.exception(str(ex))
    else:
        # Serve the previously cached result.
        with open(cached, 'r') as fo:
            final_data = json.load(fo)
            from_cache = True
    return final_data, from_cache
def search_author(name):
    """Search Scholar for `name` and return the first author's filled profile.

    Returns:
        The filled author object, or None when the search yields no results.
        (The original let StopIteration escape on an empty result set and
        could loop forever when fill() returned a falsy value.)
    """
    while True:
        result = next(scholarly.search_author(name), None)
        if result is None:
            # No results at all: nothing to retry.
            return None
        author = result.fill()
        if author:
            return author
def test_search_author_filling_author_publications(self):
    """
    Download a few publications for author and check that abstracts are
    populated with lengths within the expected limits. This exercises
    filling a publication derived from the author profile page.
    """
    matches = list(scholarly.search_author('Ipeirotis'))
    self.assertGreaterEqual(len(matches), 1)
    profile = matches[0].fill()
    # Filling the first two publications must succeed without error.
    first_two = profile.publications[:2]
    for publication in first_two:
        publication.fill()
    self.assertEqual(len(first_two), 2)
    # Every publication must have its abstract populated...
    self.assertTrue(all('abstract' in p.bib.keys() for p in first_two))
    # ...and each abstract length must fall within the expected bounds.
    lengths = [len(p.bib['abstract']) for p in first_two]
    self.assertTrue(all(1000 > n > 500 for n in lengths))
def get_all_coauthors(author_name, min_year, max_year, max_coauthors, include_no_year):
    """ Get a set of all coauthors.

    Parameters:
        author_name: author whose publications are scanned.
        min_year, max_year: inclusive publication-year range.
        max_coauthors: publications with more coauthors than this are skipped.
        include_no_year: when True, publications without a year count as
            max_year (included); otherwise they are excluded.

    Returns:
        set of reformatted coauthor names.
    """
    author = scholarly.fill(next(scholarly.search_author(author_name)))
    all_coauthors = set()
    for pub in author['publications']:
        # Evaluate if publication year is indicated (if not, include or
        # exclude depending on include_no_year).
        if 'pub_year' in pub['bib']:
            pub_year = int(pub['bib']['pub_year'])
        elif include_no_year:
            pub_year = max_year
        else:
            pub_year = min_year - 1
        # Evaluate whether publication falls within the indicated timerange.
        if min_year <= pub_year <= max_year:
            coauthors = scholarly.fill(pub)['bib']['author'].split(' and ')
            # Evaluate if number of coauthors meets optional threshold.
            if len(coauthors) <= max_coauthors:
                # set.update replaces the original side-effect-only list
                # comprehension (which built and discarded a list of Nones).
                all_coauthors.update(reformat_name(c) for c in coauthors)
    return all_coauthors
def getAuthorsPublications(name, _range=None):
    """Return up to `_range` (default 5) of an author's publications, oldest first.

    Parameters:
        name: author name to search for.
        _range: optional cap on the number of publications; clamped to the
            number available. An unparseable value yields an error message dict.

    Returns:
        {"publications": [...]} on success or {"message": ...} on a bad range.
    """
    search_query = scholarly.search_author(name)
    author = next(search_query).fill(["publications"])
    author_pubs = author.publications
    # Determine how many publications to return.
    if _range is not None:
        try:
            _range = min(int(_range), len(author_pubs))
        except (TypeError, ValueError):
            # Narrowed from a bare except: only conversion failures apply here.
            return {"message": "Invalid range argument."}
    else:
        _range = 5
    # Build the publications array, walking the list from the end (oldest first).
    pubs = []
    for i in range(_range):
        entry = author_pubs[len(author_pubs) - i - 1]
        try:
            bib = entry.fill().bib
        except Exception:
            # fill() needs a network round-trip; fall back to the stub bib.
            bib = entry.bib
        pubs.append({
            "title": bib.get("title", "unknown"),
            "author": bib.get("author", "unknown"),
            "summary": bib.get("abstract", "Summary not provided."),
            "year": bib.get("year", "unknown"),
            "url": bib.get("url", "#"),
        })
    # Local no longer named `json`, which shadowed the json module.
    return {"publications": pubs}
def scholarlyBookAuthor():
    """Look up each publication of the queried author and print a URL field.

    NOTE(review): this function looks half-finished. `string` is initialized
    to '[' and never appended to (the appending code below is commented out),
    so the return value is always the string "[". The key 'blb' looks like a
    typo for 'bib' — TODO confirm before fixing.
    """
    # try:
    query = request.form['query']
    search_query = scholarly.search_author(query)
    string = '['
    author = next(search_query).fill()
    for pub in author.publications:
        # Look the publication up by title to obtain its full record.
        search_book = scholarly.search_pubs(pub.bib['title'])
        book = next(search_book)
        # print (book)
        url = ''
        try:
            url = book['blb']  # presumably should be 'bib' — TODO confirm
        except:
            url = ''
        print(url)
        # print (basename(url))
    # Commented-out draft of the JSON-array assembly:
    # for i in range(5):
    #     try:
    #         author = next(search_query)
    #         string += str(author) + ","
    #     except:
    #         print('')
    #
    # if (len(string) > 0):
    #     string = string[:-1]
    # string=string+"]"
    return (str(string))
def test_search_author_multiple_authors(self):
    """
    As of May 12, 2020 there are at least 24 'Cattanis's listed as authors
    and Giordano Cattani is one of them
    """
    names = [match.name for match in scholarly.search_author('cattani')]
    self.assertGreaterEqual(len(names), 24)
    self.assertIn(u'Giordano Cattani', names)
def test_search_author_single_author(self):
    """Fill a unique author and spot-check profile and publication ids."""
    matches = list(scholarly.search_author('Steven A. Cholewiak'))
    self.assertGreaterEqual(len(matches), 1)
    filled = matches[0].fill()
    self.assertEqual(filled.name, u'Steven A. Cholewiak, PhD')
    self.assertEqual(filled.id, u'4bahYMkAAAAJ')
    # The third publication must resolve to the expected citation id.
    third_pub = filled.publications[2].fill()
    self.assertEqual(third_pub.id_citations, u'4bahYMkAAAAJ:ufrVoPGSRksC')
def test_search_author_single_author(self):
    """Fill a unique author via the dict API and spot-check the ids."""
    matches = list(scholarly.search_author('Steven A. Cholewiak'))
    self.assertGreaterEqual(len(matches), 1)
    profile = scholarly.fill(matches[0])
    self.assertEqual(profile['name'], u'Steven A. Cholewiak, PhD')
    self.assertEqual(profile['scholar_id'], u'4bahYMkAAAAJ')
    # The third publication must resolve to the expected author_pub_id.
    third_pub = scholarly.fill(profile['publications'][2])
    self.assertEqual(third_pub['author_pub_id'], u'4bahYMkAAAAJ:LI9QrySNdTsC')
def _search_author_by_name(self, name):
    """Query Scholar for authors matching `name`, rotating proxies until the
    query round-trip succeeds.

    Returns:
        The (possibly empty) list of matching authors. Retries forever on
        failure, switching to a new proxy each time.
    """
    # Dead `authors = []` initializer removed — it was always reassigned or
    # never read before the return inside the try.
    while True:
        try:
            return list(scholarly.search_author(name))
        except Exception:
            # Any failure (timeout, block) → switch proxy and retry.
            print("trying new proxy")
            self._set_new_proxy()
def proxied_search_author(author):
    """Run scholarly.search_author behind rotating proxies, retrying until it succeeds.

    Returns:
        The author search generator from scholarly.
    """
    while True:
        try:
            result = scholarly.search_author(author)
        except Exception as err:
            # Report the failure, rotate the proxy, and try again.
            print(err)
            print("Trying new proxy")
            set_new_proxy()
        else:
            print("Got the results of the query")
            return result
def add_search():
    """Render the add-academic page with up to 10 Scholar matches for the query.

    Reads the search term from request.args via SearchForm; an empty term
    yields an empty result list.
    """
    search_form = SearchForm(formdata=request.args)
    academics = []
    if search_form.search.data:
        # islice caps the generator at 10 results; list() replaces the
        # original manual append loop (PERF401).
        academics = list(islice(scholarly.search_author(search_form.search.data), 0, 10))
    return render_template("ui/add.html", academics=academics, search_form=search_form)
def get(self):
    """GET endpoint: return the cites-per-year histogram of the named author."""
    parser = reqparse.RequestParser()
    parser.add_argument('name', required=True)
    author_name = parser.parse_args()['name']
    # Take the first search hit and fill the full profile.
    profile = next(scholarly.search_author(author_name)).fill()
    return {"cites_per_year": profile.cites_per_year}
def get_submitter_info(submitter, paper_title):
    """Find the Scholar profile of `submitter` that lists a publication whose
    title fuzzily matches `paper_title`.

    Returns:
        The first author profile containing a publication with title
        similarity >= 0.9, or None when candidates run out or any lookup fails.
    """
    try:
        query = scholarly.search_author(submitter)
        while True:
            author = next(query).fill()
            for pub in author.publications:
                title_match_score = SequenceMatcher(a=pub.bib["title"], b=paper_title).ratio()
                if title_match_score >= 0.9:
                    return author
    except Exception:
        # StopIteration (candidates exhausted) and network errors both end the
        # search. Narrowed from a bare `except:`, which also trapped
        # KeyboardInterrupt/SystemExit.
        return None
def save_csv():
    """Dump the publications of a fixed author to output.csv (pipe-separated).

    Columns: title|authors|year|abstract.
    """
    search_query = scholarly.search_author("Mayken Espinoza-Andaluz")
    author = scholarly.fill(next(search_query))
    # `with` guarantees the file is closed even if a lookup below raises
    # (the original leaked the handle on any exception).
    with open("output.csv", "w+") as _file:
        _file.write("title|authors|year|abstract\n")
        for pub in author["publications"]:
            bib = pub["bib"]
            title = bib["title"]
            # Fixed: `authors` was an undefined name (NameError at runtime).
            authors = bib.get("author", "")
            year = bib.get("pub_year", "")
            # Fixed: abstract previously copied the title.
            abstract = bib.get("abstract", "")
            _file.write(f"{title}|{authors}|{year}|{abstract}\n")
def get_author(author, university=""):
    """Fetch a summary of a Scholar author profile, optionally narrowing the
    search with a university name.

    Returns:
        dict with name, affiliation, email_domain, interests, publication
        count, citation/h/i10 indices and the profile URL, or the string
        "Not Found" when the search yields no result.
    """
    # Base URL the profile link is built from (scholar_id appended below).
    # NOTE(review): the original line was garbled ('..."******", ...'), most
    # likely by a credential scrubber; reconstructed from the downstream use
    # of url_part + authorRaw['scholar_id'] — confirm against file history.
    url_part = "https://scholar.google.co.in/citations?user="
    author_search = scholarly.search_author(author + (", " + university if university != '' else ''))
    try:
        author_result = next(author_search)
    except Exception:
        return "Not Found"
    author_raw = scholarly.fill(author_result, sections=['basics', 'indices', 'publications'])
    return {
        'name': author_raw['name'],
        'affiliation': author_raw['affiliation'],
        'email_domain': author_raw['email_domain'],
        'interests': author_raw['interests'],
        'publications': len(author_raw['publications']),
        'citedby': author_raw['citedby'],
        'hindex': author_raw['hindex'],
        'i10index': author_raw['i10index'],
        'gscholar_url': url_part + author_raw['scholar_id'],
    }
def fetch_citations(author, filesave="citations.json", proxy="", proxy_list=""):
    """
    Fetch citations from google scholar using scholarly.

    Parameters:
        author: author name to look up.
        filesave: output JSON path for the publication list.
        proxy: single http(s) proxy URL, used when non-empty.
        proxy_list: path to a file with one proxy per line, used when non-empty.
    """
    if proxy != "":
        print("Setting up proxy ", proxy)
        scholarly.use_proxy(scholarly.SingleProxy(http=proxy, https=proxy))
    if proxy_list != "":
        lproxies = open(proxy_list, 'r').readlines()

        def proxy_gen():
            # Hand scholarly the next proxy from the list on each call.
            if proxy_gen.counter >= len(lproxies):
                raise IndexError("We ran out of proxies...")
            proxy = lproxies[proxy_gen.counter]
            if not proxy.startswith("http"):
                proxy = "http://" + proxy
            proxy_gen.counter += 1
            return proxy

        proxy_gen.counter = 0
        scholarly.use_proxy(proxy_gen)
    print("Looking up " + author)
    search = scholarly.search_author(author)
    author = scholarly.fill(next(search))
    publications = []
    for i, pub in enumerate(author['publications']):
        cites = pub['num_citations']  # often this gets messed up upon fill()
        if "pub_year" in pub['bib']:
            pubyear = pub['bib']["pub_year"]  # also gets messed up upon fill()
            pub = scholarly.fill(pub)
            pub['bib']["pub_year"] = pubyear
        else:
            pub = scholarly.fill(pub)
            # Fixed: `pub.bib` used the old attribute API and raised
            # AttributeError on the dict-based publication objects.
            if "pub_year" not in pub['bib']:
                # Skip publications that really don't have a year; they are
                # probably junk picked up by the search robot.
                continue
        pub['num_citations'] = cites
        print("Fetching: " + str(i) + "/" + str(len(author['publications'])) +
              ": " + pub['bib']["title"] + " (" + str(pub['bib']["pub_year"]) + ")")
        # Strip bulky/unneeded fields before serializing.
        pub['bib'].pop("abstract", None)
        pub.pop("source", None)
        publications.append(pub)
    # `with` replaces the manual open/write/close and closes on error too.
    with open(filesave, "w") as f:
        f.write(json.dumps(publications))
def get_author(search):
    """
    Queries google scholar to find an author given a search string.
    Raises ValueError unless exactly one result is found.
    """
    authors = list(scholarly.search_author(search))
    if len(authors) > 1:
        # Fixed: the f-string referenced undefined `searc` (NameError when
        # this branch fired); also corrected the 'specifc' message typo.
        raise ValueError(f'Found >1 authors with search string: {search}, try something more specific')
    elif not authors:
        raise ValueError(f'Could not find authors with search string: {search}')
    return authors[0].fill(sections=['basics', 'indices', 'publications'])
def test_scholar():
    '''Google Scholar connection test: look up a bogus and a real author and
    print whichever profile is found.'''
    pp = pprint.PrettyPrinter(2)
    # First entry is deliberately nonsense to exercise the not-found path.
    tab_author = ['xxxx bidule', 'Steven A. Cholewiak']
    for author in tab_author:
        generator = scholarly.search_author(author)
        doc = get_author(generator)
        if doc is None:
            print('Author not found:' + author)
        else:
            print('Author found:' + author)
            pp.pprint(doc)
# NOTE(review): the dangling triple-quote below appears to open a
# commented-out region that continues past this chunk — confirm; as the last
# line of a file it would be a syntax error.
'''