def get_papers_for_orcid_id(self, orcid_id):
    orcid_id = normalize_orcid_id(orcid_id)
    lb.i("Querying ADS for orcid id " + orcid_id)
    query = f"orcid:({orcid_id})"

    documents = self._inner_query_for_author(query, 1)

    author_record = AuthorRecord(
        name=ADSName.parse(orcid_id, preserve=True), documents=[])
    names = set()
    for document in documents:
        try:
            i = document.orcid_ids.index(orcid_id)
        except ValueError:
            lb.w(f"ORCID ID not found in {document.bibcode}")
            continue
        author_record.documents.append(document.bibcode)
        names.add(document.authors[i])

    # Find the most-detailed form of the name
    if len(names):
        names = [ADSName.parse(n) for n in names]
        intermed = [(n.level_of_detail, len(n.full_name), n) for n in names]
        intermed.sort(reverse=True)
        author_record.name = intermed[0][-1]

    return author_record, documents

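
# normalize_orcid_id() is used above but not shown in this section. Below is a
# minimal sketch of what such a normalizer might do -- an assumption for
# illustration, not the project's actual implementation. ORCID iDs are often
# pasted as full URLs and the trailing checksum character may be a lower-case
# "x", so reducing them to a bare, upper-case dash-separated form makes them
# directly comparable to the IDs ADS returns.
def normalize_orcid_id_sketch(orcid_id: str) -> str:
    # Strip a leading "https://orcid.org/"-style prefix if present
    orcid_id = orcid_id.strip()
    for prefix in ("https://orcid.org/", "http://orcid.org/", "orcid.org/"):
        if orcid_id.lower().startswith(prefix):
            orcid_id = orcid_id[len(prefix):]
            break
    # The checksum character may be the letter X, canonically upper case
    return orcid_id.upper()

# Example: normalize_orcid_id_sketch("https://orcid.org/0000-0002-1825-009x")
# returns "0000-0002-1825-009X".
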
def get_document(self, bibcode):
    lb.i("Querying ADS for bibcode " + bibcode)
    t_start = time.time()
    params = {"q": "bibcode:" + bibcode,
              "fl": ",".join(FIELDS)}
    r = requests.get("https://api.adsabs.harvard.edu/v1/search/query",
                     params=params,
                     headers={"Authorization": f"Bearer {ADS_TOKEN}"})
    t_stop = time.time()
    lb.on_network_complete(t_stop - t_start)

    rec = self._article_to_record(r.json()['response']['docs'][0])
    return rec

def find_route(request):
    try:
        data, code, headers, cache_key = backend_common.find_route(
            request, load_cached_result=False)
        if data is None:
            # The result is already cached---refer the user to the cache file
            response = {"responseAtUrl": CLOUD_STORAGE_URL_FORMAT.format(
                CLOUD_STORAGE_BUCKET_NAME, cache_key)}
            return json.dumps(response), code, headers
        if len(data.encode('utf-8')) > MAXIMUM_RESPONSE_SIZE:
            lb.i("Sending large result as separate download")
            response = {"responseAtUrl": CLOUD_STORAGE_URL_FORMAT.format(
                CLOUD_STORAGE_BUCKET_NAME, cache_key)}
            return json.dumps(response), code, headers
        return data, code, headers
    except:
        lb.log_exception()

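
# find_route() only builds the download URL; for that URL to resolve, the
# oversized (or already-cached) payload has to have been written to the
# bucket. The upload itself isn't shown in this section. As a rough, hedged
# sketch of what such a write could look like with the google-cloud-storage
# client (an assumption for illustration, not necessarily this project's
# actual upload path):
from google.cloud import storage

def upload_result_sketch(bucket_name, cache_key, data):
    """Write a JSON payload to Cloud Storage under the cache key."""
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(cache_key)
    # The bucket's lifecycle rule (auto-delete after a day) handles cleanup
    blob.upload_from_string(data, content_type="application/json")
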
def _do_query_for_author(self, params, n_authors):
    t_start = time.time()
    r = requests.get("https://api.adsabs.harvard.edu/v1/search/query",
                     params=params,
                     headers={"Authorization": f"Bearer {ADS_TOKEN}"},
                     timeout=(6.05, 6 * n_authors))
    t_elapsed = time.time() - t_start
    lb.on_network_complete(t_elapsed)
    if t_elapsed > 2 * n_authors:
        lb.w(f"Long ADS query: {t_elapsed:.2f} s for {params['q']}")

    if 'X-RateLimit-Remaining' in r.headers:
        if int(r.headers.get('X-RateLimit-Remaining', 1)) <= 1:
            reset = time.strftime(
                "%Y-%m-%d %H:%M:%S UTC",
                time.gmtime(int(r.headers.get('X-RateLimit-Reset', 0))))
            raise ADSRateLimitError(r.headers.get('X-RateLimit-Limit'), reset)
    else:
        lb.w("ADS query did not return X-RateLimit-Remaining")

    r_data = r.json()
    if "error" in r_data:
        raise ADSError('ads_error', r_data['error']['msg'])

    documents = self._articles_to_records(r_data['response']['docs'])

    if r_data['response']['numFound'] > len(documents) + params['start']:
        lb.i(f"Got too many documents in request."
             f" numFound: {r_data['response']['numFound']}"
             f" start: {params['start']}"
             f" docs rec'd: {len(documents)}")
        params['start'] += len(documents)
        documents.extend(self._do_query_for_author(params, n_authors))

    return documents

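
# The recursion above pages through results by bumping params['start'] until
# numFound is exhausted. The same pattern can be written as a loop; this is a
# self-contained illustration using a stand-in fetch_page() callable
# (hypothetical -- not part of the module above), not the module's actual code.
def paged_fetch_sketch(fetch_page, start=0):
    """Collect all docs from a start/numFound-style paged search API."""
    documents = []
    while True:
        page = fetch_page(start)  # -> {"numFound": int, "docs": [...]}
        documents.extend(page["docs"])
        start += len(page["docs"])
        # Stop once everything the query matched has been seen, or if the
        # server returns an empty page (guards against an infinite loop)
        if start >= page["numFound"] or len(page["docs"]) == 0:
            return documents

# Example with a fake two-page result set:
# _pages = {0: {"numFound": 3, "docs": ["a", "b"]},
#           2: {"numFound": 3, "docs": ["c"]}}
# paged_fetch_sketch(lambda start: _pages[start])  # -> ["a", "b", "c"]
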
def _try_generating_author_record(self, author: ADSName):
    """Generate a requested record from existing cache data

    E.g. If "=Doe, J." is searched for and "Doe, J." is already cached,
    we can generate the requested record without going to ADS."""
    if not (author.require_exact_match
            or author.require_more_specific
            or author.require_less_specific):
        # This author does not have a modifier character in front
        return None

    selected_documents = []
    try:
        author_record = cache_buddy.load_author(author.full_name)
    except CacheMiss:
        return None

    try:
        documents = cache_buddy.load_documents(author_record.documents)
    except CacheMiss:
        return None

    # TODO: This can be done with author_record.appears_as
    for doc in documents:
        for coauthor in doc.authors:
            if coauthor == author:
                selected_documents.append(doc.bibcode)
                break

    new_author_record = AuthorRecord(
        name=author, documents=selected_documents)
    self._fill_in_coauthors(new_author_record)
    cache_buddy.cache_author(new_author_record)

    lb.i(f"Author record for {str(author)} constructed from cache")
    return new_author_record

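
# A small, self-contained illustration of the idea in the docstring above:
# a broad cached record (here just a dict of made-up bibcode -> name as
# published) can be filtered down to a narrower query without another ADS
# call. matches() is a toy stand-in for ADSName's comparison logic
# (hypothetical -- the real comparison handles initials, detail levels, etc.).
def matches(name_on_paper: str, query: str) -> bool:
    if query.startswith("="):      # "=" means require the exact published form
        return name_on_paper == query[1:]
    return name_on_paper == query  # toy fallback; the real logic is fuzzier

_cached_docs = {"2020ApJ...900....1D": "Doe, J.",
                "2021ApJ...905....2D": "Doe, John"}

# A cached record for "Doe, J." covers both papers; "=Doe, J." keeps only the
# paper where the name appears exactly as "Doe, J."
exact_bibcodes = [bibcode for bibcode, name in _cached_docs.items()
                  if matches(name, "=Doe, J.")]
# exact_bibcodes == ["2020ApJ...900....1D"]
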
def get_papers_for_author(self, query_author):
    query_author = ADSName.parse(query_author)

    query_authors = self._select_authors_to_prefetch()
    if query_author not in query_authors:
        query_authors.append(query_author)

    lb.i("Querying ADS for author " + query_author.qualified_full_name)
    if len(query_authors) > 1:
        lb.i(" Also prefetching. Query: "
             + "; ".join([a.qualified_full_name for a in query_authors]))

    query_strings = []
    for author in query_authors:
        query_string = '"' + author.full_name + '"'
        if author.require_exact_match:
            query_string = "=" + query_string
        query_strings.append(query_string)
    query = " OR ".join(query_strings)
    query = f"author:({query})"

    documents = self._inner_query_for_author(query, len(query_authors))

    author_records = NameAwareDict()
    for author in query_authors:
        author_records[author] = AuthorRecord(name=author, documents=[])
    # We need to go through all the documents and match them to our
    # author list. This is critically important if we're pre-fetching
    # authors, but it's also important to support the "<" and ">"
    # specificity selectors for author names
    for document in documents:
        matched = False
        names = [ADSName.parse(n) for n in document.authors]
        for name in names:
            try:
                author_records[name].documents.append(document.bibcode)
                matched = True
            except KeyError:
                pass
        if (not matched
                and all(not a.require_more_specific
                        and not a.require_less_specific
                        for a in query_authors)):
            # See if we can guess which names should have been matched
            guesses = []
            doc_authors = [n.full_name for n in names]
            doc_authors_initialized = \
                [n.convert_to_initials().full_name for n in names]
            for query_author in query_authors:
                guess = difflib.get_close_matches(
                    query_author.full_name, doc_authors, n=1, cutoff=0.8)
                if len(guess):
                    guesses.append(
                        f"{query_author.full_name} -> {guess[0]}")
                else:
                    # Try again, changing names to use initials throughout
                    guess = difflib.get_close_matches(
                        query_author.convert_to_initials().full_name,
                        doc_authors_initialized, n=1, cutoff=0.7)
                    if len(guess):
                        # Having found a match with initialized names,
                        # report using the full form of each name
                        chosen_doc_author = doc_authors[
                            doc_authors_initialized.index(guess[0])]
                        guesses.append(f"{query_author.full_name}"
                                       f" -> {chosen_doc_author}")
            msg = "ADS Buddy: No matches for " + document.bibcode
            if len(guesses):
                msg += " . Guesses: " + "; ".join(guesses)
            lb.w(msg)

    for author_record in author_records.values():
        # Remove any duplicate document listings
        # Becomes important for papers with _many_ authors, e.g. LIGO
        # papers, which use only initials and so can have duplicate names
        author_record.documents = sorted(set(author_record.documents))

    if len(query_authors) == 1:
        return author_records[query_author], documents
    else:
        return author_records, documents

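
# The fallback guessing above relies on difflib.get_close_matches, which ranks
# candidates by SequenceMatcher ratio. A standalone illustration of why a
# second pass over initialized names (with a looser cutoff) is useful: a full
# query name scored against an initials-only published name falls short of the
# strict cutoff, but the two agree once both are reduced to initials. The
# names here are made up for the demo.
import difflib

doc_authors = ["Doe, J.", "Smith, A."]

# Full-name pass: "Doe, Jane" vs "Doe, J." only reaches a ratio of 0.75,
# so nothing clears the 0.8 cutoff
difflib.get_close_matches("Doe, Jane", doc_authors, n=1, cutoff=0.8)  # []

# Initials pass: after reducing the query to "Doe, J.", it's an exact match
difflib.get_close_matches("Doe, J.", doc_authors, n=1, cutoff=0.7)  # ['Doe, J.']
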
def clear_cache():
    clear_start = time.time()
    cache_buddy.clear_stale_data()
    lb.i(f"Cleared stale cache data in {time.time() - clear_start:.2f} s")

import json

from cache import cache_buddy

import backend_common
from log_buddy import lb

# Cloud Function responses cannot be larger than 10 MiB. If our response
# is larger, put it in Cloud Storage and return a link instead.
MAXIMUM_RESPONSE_SIZE = 9.5 * 1024 * 1024

# This bucket should be set to auto-delete files after a day or whatever
from local_config import CLOUD_STORAGE_BUCKET_NAME
CLOUD_STORAGE_URL_FORMAT = "https://storage.googleapis.com/storage/v1/b/{}/o/{}?alt=media"

lb.set_log_level(lb.INFO)
lb.i("Instance cold start")
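
# A quick illustration of how the redirect URL gets filled in. The JSON API
# expects the object name in the path to be percent-encoded, so if cache keys
# could ever contain characters like "/", quoting them first is safer (whether
# this project's keys need it is an assumption; plain keys pass through
# unchanged either way). build_response_url_sketch and the bucket name below
# are hypothetical, for illustration only.
from urllib.parse import quote

def build_response_url_sketch(bucket_name, cache_key):
    return CLOUD_STORAGE_URL_FORMAT.format(bucket_name,
                                           quote(cache_key, safe=""))

# build_response_url_sketch("my-results-bucket", "some-cache-key")
# -> "https://storage.googleapis.com/storage/v1/b/my-results-bucket/o/some-cache-key?alt=media"
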