Example #1
    def get_papers_for_orcid_id(self, orcid_id):
        orcid_id = normalize_orcid_id(orcid_id)
        lb.i(f"Querying ADS for orcid id " + orcid_id)
        query = f"orcid:({orcid_id})"

        documents = self._inner_query_for_author(query, 1)

        author_record = AuthorRecord(name=ADSName.parse(orcid_id,
                                                        preserve=True),
                                     documents=[])
        names = set()
        for document in documents:
            try:
                i = document.orcid_ids.index(orcid_id)
            except ValueError:
                lb.w(f"ORCID ID not found in {document.bibcode}")
                continue
            author_record.documents.append(document.bibcode)
            names.add(document.authors[i])

        # Find the most-detailed form of the name
        if len(names):
            names = [ADSName.parse(n) for n in names]
            intermed = [(n.level_of_detail, len(n.full_name), n)
                        for n in names]
            intermed.sort(reverse=True)
            author_record.name = intermed[0][-1]
        return author_record, documents
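
The normalize_orcid_id helper used above is not shown in these examples. A minimal sketch of what such a normalizer might do, offered as an assumption rather than the project's actual implementation:

# Hypothetical helper: strip an orcid.org URL prefix and surrounding
# whitespace, and upper-case the final check character so IDs compare
# consistently.
def normalize_orcid_id(orcid_id):
    orcid_id = orcid_id.strip()
    for prefix in ("https://orcid.org/", "http://orcid.org/", "orcid.org/"):
        if orcid_id.lower().startswith(prefix):
            orcid_id = orcid_id[len(prefix):]
            break
    return orcid_id.upper()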
Example #2
    def get_document(self, bibcode):
        lb.i("Querying ADS for bibcode " + bibcode)
        t_start = time.time()

        params = {"q": "bibcode:" + bibcode, "fl": ",".join(FIELDS)}
        r = requests.get("https://api.adsabs.harvard.edu/v1/search/query",
                         params=params,
                         headers={"Authorization": f"Bearer {ADS_TOKEN}"})
        t_stop = time.time()
        lb.on_network_complete(t_stop - t_start)

        rec = self._article_to_record(r.json()['response']['docs'][0])
        return rec
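
FIELDS is defined elsewhere in the module. A plausible definition, guessing at the standard ADS Solr fields this code appears to need (bibcodes, author names, affiliations, and ORCID iDs); the actual list in the project may differ:

# Assumed field list, not the project's actual constant
FIELDS = ["bibcode", "title", "author", "aff", "pubdate",
          "orcid_pub", "orcid_user", "orcid_other"]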
Example #3
def find_route(request):
    try:
        data, code, headers, cache_key = backend_common.find_route(
            request, load_cached_result=False)
        
        if data is None:
            # The result is already cached---refer the user to the cache file
            response = {"responseAtUrl": CLOUD_STORAGE_URL_FORMAT.format(
                CLOUD_STORAGE_BUCKET_NAME, cache_key)}
            return json.dumps(response), code, headers
        
        if len(data.encode('utf-8')) > MAXIMUM_RESPONSE_SIZE:
            lb.i("Sending large result as separate download")
            response = {"responseAtUrl": CLOUD_STORAGE_URL_FORMAT.format(
                CLOUD_STORAGE_BUCKET_NAME, cache_key)}
            return json.dumps(response), code, headers
        return data, code, headers
    except Exception:
        lb.log_exception()
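
A caller has to notice when the payload has been deferred to Cloud Storage. A minimal client-side sketch, assuming only the responseAtUrl key used above and the requests library (the helper name is made up):

import requests

def fetch_find_route_result(response_json):
    # If the backend deferred the payload to Cloud Storage, follow the link;
    # otherwise the JSON body already is the result.
    if "responseAtUrl" in response_json:
        r = requests.get(response_json["responseAtUrl"])
        r.raise_for_status()
        return r.json()
    return response_json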
Example #4
    def _do_query_for_author(self, params, n_authors):
        t_start = time.time()
        r = requests.get("https://api.adsabs.harvard.edu/v1/search/query",
                         params=params,
                         headers={"Authorization": f"Bearer {ADS_TOKEN}"},
                         timeout=(6.05, 6 * n_authors))
        t_elapsed = time.time() - t_start
        lb.on_network_complete(t_elapsed)
        if t_elapsed > 2 * n_authors:
            lb.w(f"Long ADS query: {t_elapsed:.2f} s for {params['q']}")

        if 'X-RateLimit-Remaining' in r.headers:
            if int(r.headers.get('X-RateLimit-Remaining', 1)) <= 1:
                reset = time.strftime(
                    "%Y-%m-%d %H:%M:%S UTC",
                    time.gmtime(int(r.headers.get('X-RateLimit-Reset', 0))))
                raise ADSRateLimitError(r.headers.get('X-RateLimit-Limit'),
                                        reset)
        else:
            lb.w("ADS query did not return X-RateLimit-Remaining")

        r_data = r.json()
        if "error" in r_data:
            raise ADSError('ads_error', r_data['error']['msg'])

        documents = self._articles_to_records(r_data['response']['docs'])

        if r_data['response']['numFound'] > len(documents) + params['start']:
            lb.i(f"Got too many documents in request."
                 f" numFound: {r_data['response']['numFound']}"
                 f" start: {params['start']}"
                 f" docs rec'd: {len(documents)}")
            params['start'] += len(documents)
            documents.extend(self._do_query_for_author(params, n_authors))

        return documents
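
ADSError and ADSRateLimitError are defined elsewhere. A minimal sketch of what the exception classes might look like, inferred only from how they are raised above (their real definitions may differ):

class ADSError(Exception):
    # Assumed two-argument form, matching ADSError('ads_error', msg) above
    def __init__(self, key, message):
        super().__init__(message)
        self.key = key


class ADSRateLimitError(Exception):
    # Assumed to carry the quota size and the reset timestamp string
    def __init__(self, rate_limit, reset_time):
        super().__init__(
            f"ADS rate limit of {rate_limit} reached; resets at {reset_time}")
        self.rate_limit = rate_limit
        self.reset_time = reset_time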
Example #5
    def _try_generating_author_record(self, author: ADSName):
        """Generate a requested record from existing cache data
        
        E.g. If "=Doe, J." is searched for and "Doe, J." is already cached,
        we can generate the requested record without going to ADS."""

        if not (author.require_exact_match or author.require_more_specific
                or author.require_less_specific):
            # This author does not have a modifier character in front
            return None

        selected_documents = []
        try:
            author_record = cache_buddy.load_author(author.full_name)
        except CacheMiss:
            return None

        try:
            documents = cache_buddy.load_documents(author_record.documents)
        except CacheMiss:
            return None
        # TODO: This can be done with author_record.appears_as
        for doc in documents:
            for coauthor in doc.authors:
                if coauthor == author:
                    selected_documents.append(doc.bibcode)
                    break

        new_author_record = AuthorRecord(name=author,
                                         documents=selected_documents)
        self._fill_in_coauthors(new_author_record)

        cache_buddy.cache_author(new_author_record)

        lb.i(f"Author record for {str(author)} constructed from cache")
        return new_author_record
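
A plausible call site, assuming for illustration an object named buddy that exposes both of the methods shown in these examples (a usage sketch, not the project's actual control flow): try the cache-derived record first and fall back to a live ADS query only when it cannot be built.

author = ADSName.parse("=Doe, J.")
record = buddy._try_generating_author_record(author)
if record is None:
    # Nothing usable in the cache; query ADS directly
    record, documents = buddy.get_papers_for_author(author)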
Example #6
    def get_papers_for_author(self, query_author):
        query_author = ADSName.parse(query_author)

        query_authors = self._select_authors_to_prefetch()
        if query_author not in query_authors:
            query_authors.append(query_author)

        lb.i(f"Querying ADS for author " + query_author.qualified_full_name)
        if len(query_authors) > 1:
            lb.i(" Also prefetching. Query: " +
                 "; ".join([a.qualified_full_name for a in query_authors]))

        query_strings = []
        for author in query_authors:
            query_string = '"' + author.full_name + '"'
            if author.require_exact_match:
                query_string = "=" + query_string
            query_strings.append(query_string)
        query = " OR ".join(query_strings)
        query = f"author:({query})"

        documents = self._inner_query_for_author(query, len(query_authors))

        author_records = NameAwareDict()
        for author in query_authors:
            author_records[author] = AuthorRecord(name=author, documents=[])
        # We need to go through all the documents and match them to our
        # author list. This is critically important if we're pre-fetching
        # authors, but it's also important to support the "<" and ">"
        # specificity selectors for author names
        for document in documents:
            matched = False
            names = [ADSName.parse(n) for n in document.authors]
            for name in names:
                try:
                    author_records[name].documents.append(document.bibcode)
                    matched = True
                except KeyError:
                    pass
            if (not matched and all(
                    not a.require_more_specific and not a.require_less_specific
                    for a in query_authors)):
                # See if we can guess which names should have been matched
                guesses = []
                doc_authors = [n.full_name for n in names]
                doc_authors_initialized = \
                    [n.convert_to_initials().full_name for n in names]
                for query_author in query_authors:
                    guess = difflib.get_close_matches(query_author.full_name,
                                                      doc_authors,
                                                      n=1,
                                                      cutoff=0.8)
                    if len(guess):
                        guesses.append(
                            f"{query_author.full_name} -> {guess[0]}")
                    else:
                        # Try again, changing names to use initials throughout
                        guess = difflib.get_close_matches(
                            query_author.convert_to_initials().full_name,
                            doc_authors_initialized,
                            n=1,
                            cutoff=0.7)
                        if len(guess):
                            # Having found a match with initialized names,
                            # report using the full form of each name
                            chosen_doc_author = doc_authors[
                                doc_authors_initialized.index(guess[0])]
                            guesses.append(f"{query_author.full_name}"
                                           f" -> {chosen_doc_author}")
                msg = "ADS Buddy: No matches for " + document.bibcode
                if len(guesses):
                    msg += " . Guesses: " + "; ".join(guesses)
                lb.w(msg)

        for author_record in author_records.values():
            # Remove any duplicate document listings
            # Becomes important for papers with _many_ authors, e.g. LIGO
            # papers, which use only initials and so can have duplicate names
            author_record.documents = sorted(set(author_record.documents))

        if len(query_authors) == 1:
            return author_records[query_author], documents
        else:
            return author_records, documents
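
For concreteness, the query-building loop above produces strings like the following (made-up names, illustrative only):

# Standalone reproduction of the query construction with sample input.
# An author flagged for exact matching gets the "=" prefix.
query_strings = []
for full_name, exact in [("Doe, J.", False), ("Smith, A.", True)]:
    query_string = '"' + full_name + '"'
    if exact:
        query_string = "=" + query_string
    query_strings.append(query_string)
query = "author:(" + " OR ".join(query_strings) + ")"
# query is now: author:("Doe, J." OR ="Smith, A.")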
Example #7
def clear_cache():
    clear_start = time.time()
    cache_buddy.clear_stale_data()
    lb.i(f"Cleared stale cache data in {time.time() - clear_start:.2f} s")
Example #8
import json

from cache import cache_buddy

import backend_common
from log_buddy import lb

# Cloud Function responses cannot be larger than 10 MiB. If our response
# is larger, put it in Cloud Storage and return a link instead.
MAXIMUM_RESPONSE_SIZE = 9.5 * 1024 * 1024
# This bucket should be set to auto-delete files after a day or so
from local_config import CLOUD_STORAGE_BUCKET_NAME
CLOUD_STORAGE_URL_FORMAT = "https://storage.googleapis.com/storage/v1/b/{}/o/{}?alt=media"
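# Illustratively, with a made-up bucket name and cache key, the formatted URL
# looks like:
# https://storage.googleapis.com/storage/v1/b/my-bucket/o/my-cache-key?alt=media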

lb.set_log_level(lb.INFO)
lb.i("Instance cold start")


def find_route(request):
    try:
        data, code, headers, cache_key = backend_common.find_route(
            request, load_cached_result=False)
        
        if data is None:
            # The result is already cached---refer the user to the cache file
            response = {"responseAtUrl": CLOUD_STORAGE_URL_FORMAT.format(
                CLOUD_STORAGE_BUCKET_NAME, cache_key)}
            return json.dumps(response), code, headers
        
        if len(data.encode('utf-8')) > MAXIMUM_RESPONSE_SIZE:
            lb.i("Sending large result as separate download")