def get_schoolar_data(author_name, cache_folder="scholarly", affiliation='UBC'):
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", cache_folder)
    cached = os.path.join(output_folder, format_author(author_name))
    from_cache = False
    final_data = []
    if not os.path.isfile(cached):
        try:
            # Retrieve the author's data, fill it in, and collect publication titles
            search_query = scholarly.search_author(f'{author_name} {affiliation}')
            author = scholarly.fill(next(search_query))
            titles = [pub['bib']['title'] for pub in author['publications']]
            final_data = []
            for title in titles:
                logger.info("Processing " + Fore.YELLOW + title + Style.RESET_ALL)
                ret = get_publication(title)
                retries = 0
                while not ret['success'] and retries < MAX_RETRIES_ON_ERROR:
                    retries += 1
                    msg = "Error while querying CrossRef API ({}), retrying ({})...".format(
                        ret["exception"], retries)
                    logger.info(Fore.RED + msg + Style.RESET_ALL)
                    ret = get_publication(title)
                    sleep(3)
                if ret['success']:
                    ret['original_title'] = title
                    final_data.append(ret)
                else:
                    logger.info(Fore.RED + '> Failed' + Style.RESET_ALL)
            # Keep only confident matches and rank them by similarity
            final_data = list(
                filter(lambda k: k['result']['similarity'] >= 0.7, final_data))
            final_data = sorted(
                final_data, key=lambda k: k['result']['similarity'], reverse=True)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except StopIteration:
            logger.info(Fore.RED + 'no more scholar data available' + Style.RESET_ALL)
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        except Exception as ex:
            logger.exception(str(ex))
    else:
        with open(cached, 'r') as fo:
            final_data = json.load(fo)
        from_cache = True
    return final_data, from_cache
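# Hedged usage sketch, not part of the original module: a small driver showing how
# get_schoolar_data could be called for a batch of authors. The demo function name
# and the author names are placeholders/assumptions for illustration only.
def _demo_get_schoolar_data(author_names=('Jane Doe', 'John Roe')):
    for name in author_names:
        data, from_cache = get_schoolar_data(name, affiliation='UBC')
        source = 'cache' if from_cache else 'Google Scholar + CrossRef'
        logger.info(f'{name}: {len(data)} matched publications (from {source})')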
def wikidata_keywords(author_name, keyword_authors, authors_keywords):
    resources_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "resources")
    author_file = format_author(author_name)
    gdata = load_google_data(author_file, resources_folder)
    for row in gdata:
        try:
            title = None
            if row['title']:
                title = next(iter(row['title']))
            if not title and 'original_title' in row:
                title = row['original_title']
            if not title:
                continue
            # Main subject keywords for this publication
            main_subject = get_publication_subject(author_name, row, title=title)
            if not main_subject:
                continue
            for keyword in main_subject:
                keyword_authors[keyword].append(author_name)
                authors_keywords[author_name].append(keyword)
        except Exception as ex:
            logger.exception(ex)
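# Hedged usage sketch (illustrative, not in the original source): wikidata_keywords
# appends to list-valued accumulators, so defaultdict(list) is a natural fit. The
# demo function name and author names below are assumptions.
def _demo_wikidata_keywords():
    from collections import defaultdict
    keyword_authors = defaultdict(list)    # keyword -> [author, ...]
    authors_keywords = defaultdict(list)   # author  -> [keyword, ...]
    for name in ('Jane Doe', 'John Roe'):
        wikidata_keywords(name, keyword_authors, authors_keywords)
    return dict(keyword_authors), dict(authors_keywords)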
def publications_info(author_name, test=False):
    logger.info('\nProcessing' + Fore.YELLOW + f' {author_name}' + Style.RESET_ALL)
    resources_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "resources")
    author_file = format_author(author_name)
    output_folder = os.path.join(resources_folder, "wikidata")
    cached = os.path.join(output_folder, author_file)
    if not os.path.isfile(cached):
        # Merge the Google Scholar and CrossRef records for this author
        publication_lst = merge_sources(
            load_google_data(author_file, resources_folder),
            load_crossref_data(author_file, resources_folder))
        logger.info("\tFetching publications on " + Fore.YELLOW + 'Wikidata' + Style.RESET_ALL)
        in_wikidata, not_in_wikidata = [], []
        for publication in publication_lst:
            data = wikidata_api.get_publication(publication["DOI"])
            if data:
                in_wikidata.append(data)
            else:
                not_in_wikidata.append(publication)
        author_wikidata_id = get_wikidata_author_id(author_name, in_wikidata)
        detailed_wikidata_lst = get_wikidata_detailed_publications(in_wikidata)
        final_data = dict(
            author=author_name,
            wikidata_id=author_wikidata_id,
            wikidata=detailed_wikidata_lst,
            missing_data=not_in_wikidata)
        if not test:
            with open(cached, 'w') as fo:
                json.dump(final_data, fo, indent=4, sort_keys=True)
        logger.info(Fore.YELLOW + '> done' + Style.RESET_ALL)
    else:
        logger.info(Fore.YELLOW + '> already in cache' + Style.RESET_ALL)
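# Hedged usage sketch (illustrative only): publications_info writes one JSON cache
# file per author under resources/wikidata, so a batch run is just a loop. The demo
# function name and author list are placeholders; test=True skips writing the cache.
def _demo_publications_info(author_names=('Jane Doe',), dry_run=False):
    for name in author_names:
        publications_info(name, test=dry_run)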
def wikidata_import(author_name, test=False):
    resources_folder = os.path.join(
        os.path.dirname(os.path.realpath(__file__)), "..", "resources")
    author_file = format_author(author_name)
    output_folder = os.path.join(resources_folder, "wikidata")
    cached = os.path.join(output_folder, author_file)
    result = []
    if os.path.isfile(cached):
        with open(cached) as fo:
            data = json.load(fo)
        new_entry = dict(
            Len=data['author'],       # label (English)
            P31='Q5',                 # instance of = human
            P106='Q1650915',          # occupation = researcher
            P463='Q106489997',        # member of = Future Water Cluster
            P2561=data['author'],     # name
        )
        result.append(new_entry)
    return result
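# Hedged sketch (not in the original source): shows what a wikidata_import entry
# looks like when dumped for review before any actual import. The property/value
# pairs come straight from wikidata_import above; the demo function name and the
# example author are assumptions.
def _demo_wikidata_import(author_name='Jane Doe'):
    entries = wikidata_import(author_name)
    # Each entry maps QuickStatements-style keys (Len = English label, plus
    # P31, P106, P463, P2561) to their values.
    for entry in entries:
        logger.info(json.dumps(entry, indent=2, sort_keys=True))
    return entries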
def publications_info(author_name, test=False):
    os.chdir(os.path.dirname(os.path.realpath(__file__)))
    wikidata_id = None
    output_folder = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                                 "..", "resources", "crossref")
    cached = os.path.join(output_folder, format_author(author_name))
    if not os.path.isfile(cached):
        logger.info('Processing' + Fore.YELLOW + f' {author_name}' + Style.RESET_ALL)
        logger.info("Fetching author " + Fore.YELLOW + 'on CrossRef' + Style.RESET_ALL)
        author_data = crossref_api.get_author(
            author_name, max_results=1000, sim_threshold=0.90)
        with open(cached, 'w') as fo:
            json.dump(author_data, fo, indent=4, sort_keys=True)
        return author_data
    else:
        # logger.info(Fore.YELLOW + f'loaded {author_name} from cache' + Style.RESET_ALL)
        with open(cached, 'r') as fo:
            return json.load(fo)