Example #1
def _parse(res, n, url, params, verbose, *args, **kwds):
    """Auxiliary function to download results and parse json."""
    cursor = "cursor" in params
    if not cursor:
        start = params["start"]
    _json = res.get('search-results', {}).get('entry', [])
    if verbose:
        chunk = 1
        # Round up, plus 1 for the final iteration
        chunks = int(n / params['count']) + (n % params['count'] > 0) + 1
        print(f'Downloading results for query "{params["query"]}":')
        print_progress(chunk, chunks)
    # Download the remaining information in chunks
    while n > 0:
        n -= params["count"]
        if cursor:
            pointer = res['search-results']['cursor'].get('@next')
            params.update({'cursor': pointer})
        else:
            start += params["count"]
            params.update({'start': start})
        resp = get_content(url, params, *args, **kwds)
        data = resp.json()
        _json.extend(data.get('search-results', {}).get('entry', []))
        if verbose:
            chunk += 1
            print_progress(chunk, chunks)
    return _json, resp.headers
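
The helper above pulls the remaining result pages either via a server-side cursor (the '@next' token) or via a classic start/count offset. A minimal standalone sketch of the same pagination pattern, with a hypothetical fetch_page() standing in for get_content() plus JSON parsing:

def paginate(fetch_page, params, total):
    """Collect all entries, preferring cursor paging when available."""
    use_cursor = "cursor" in params
    entries = []
    remaining = total
    while remaining > 0:
        page = fetch_page(params)  # hypothetical: returns one page of parsed JSON
        entries.extend(page.get('search-results', {}).get('entry', []))
        remaining -= params["count"]
        if use_cursor:
            params["cursor"] = page['search-results']['cursor'].get('@next')
        else:
            params["start"] = params.get("start", 0) + params["count"]
    return entries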
Example #2
 def get_coauthors(self) -> Optional[List[NamedTuple]]:
     """Retrieves basic information about co-authors as a list of
     namedtuples in the form
     (surname, given_name, id, areas, affiliation_id, name, city, country),
     where areas is a list of subject area codes joined by "; ".
     Note: Method retrieves information via individual queries which will
     not be cached.  The Scopus API returns 160 coauthors at most.
     """
     SIZE = 25
     # Get number of authors to search for
     url = self.coauthor_link
     if not url:
         return None
     res = get_content(url, api="AuthorSearch")
     data = loads(res.text)['search-results']
     N = int(data.get('opensearch:totalResults', 0))
     # Store information in namedtuples
     fields = 'surname given_name id areas affiliation_id name city country'
     coauth = namedtuple('Coauthor', fields)
     coauthors = []
     # Iterate over search results in chunks of `SIZE` results
     count = SIZE
     start = 0
     while start < N:
         params = {'start': start, 'count': count, 'accept': 'json'}
         res = get_content(url, api="AuthorSearch", params=params)
         data = loads(res.text)['search-results'].get('entry', [])
         # Extract information for each coauthor
         for entry in data:
             aff = entry.get('affiliation-current', {})
             try:
                 areas = [a['$'] for a in entry.get('subject-area', [])]
             except TypeError:  # Only one subject area given
                 areas = [entry['subject-area']['$']]
             new = coauth(
                 surname=entry['preferred-name']['surname'],
                 given_name=entry['preferred-name'].get('given-name'),
                 id=int(entry['dc:identifier'].split(':')[-1]),
                 areas='; '.join(areas),
                 name=aff.get('affiliation-name'),
                 affiliation_id=aff.get('affiliation-id'),
                 city=aff.get('affiliation-city'),
                 country=aff.get('affiliation-country'))
             coauthors.append(new)
         start += SIZE
     return coauthors or None
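
For context, a call from the user's side might look as follows; the import path assumes the method lives on pybliometrics' AuthorRetrieval class, and the author ID is purely illustrative:

from pybliometrics.scopus import AuthorRetrieval  # assumed import path

au = AuthorRetrieval("7004212771")   # illustrative Scopus author ID
coauthors = au.get_coauthors()       # list of Coauthor namedtuples, or None
if coauthors:
    for c in coauthors[:5]:
        print(c.surname, c.given_name, c.affiliation_id)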
Example #3
 def get_coauthors(self):
     """Retrieves basic information about co-authors as a list of
     namedtuples in the form
     (surname, given_name, id, areas, affiliation_id, name, city, country),
     where areas is a list of subject area codes joined by "; ".
     Note: This information will not be cached, and retrieval is slow for
     large coauthor groups.
     """
     SIZE = 25
     # Get number of authors to search for
     url = self.coauthor_link
     res = get_content(url=url)
     data = loads(res.text)['search-results']
     N = int(data.get('opensearch:totalResults', 0))
     # Store information in namedtuples
     fields = 'surname given_name id areas affiliation_id name city country'
     coauth = namedtuple('Coauthor', fields)
     coauthors = []
     # Iterate over search results in chunks of 25 results
     count = SIZE
     start = 0
     while start < N:
         params = {'start': start, 'count': count}
         res = get_content(url=url, params=params, accept='json')
         data = loads(res.text)['search-results'].get('entry', [])
         # Extract information for each coauthor
         for entry in data:
             aff = entry.get('affiliation-current', {})
             try:
                 areas = [a['$'] for a in entry.get('subject-area', [])]
             except TypeError:  # Only one subject area given
                 areas = [entry['subject-area']['$']]
             new = coauth(
                 surname=entry['preferred-name']['surname'],
                 given_name=entry['preferred-name'].get('given-name'),
                 id=entry['dc:identifier'].split(':')[-1],
                 areas='; '.join(areas),
                 name=aff.get('affiliation-name'),
                 affiliation_id=aff.get('affiliation-id'),
                 city=aff.get('affiliation-city'),
                 country=aff.get('affiliation-country'))
             coauthors.append(new)
         start += SIZE
     return coauthors or None
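
Both versions handle the quirk that Scopus returns `subject-area` as a list of dicts when there are several areas but as a single dict when there is only one. A small stand-alone illustration of that normalization, using made-up payloads:

def subject_areas(entry):
    """Return the subject-area codes as a list, whether one or many are given."""
    try:
        return [a['$'] for a in entry.get('subject-area', [])]
    except TypeError:  # a single dict instead of a list of dicts
        return [entry['subject-area']['$']]

print(subject_areas({'subject-area': [{'$': 'MEDI'}, {'$': 'BIOC'}]}))  # ['MEDI', 'BIOC']
print(subject_areas({'subject-area': {'$': 'COMP'}}))                   # ['COMP']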
Example #4
    def __init__(self,
                 fname,
                 refresh,
                 params,
                 url,
                 download=None,
                 max_entries=None,
                 verbose=False,
                 *args,
                 **kwds):
        """Class intended as base class for superclasses.

        Parameters
        ----------
        fname : str
            The filename (including path) of the cache object.

        refresh : bool or int
            Whether to refresh the cached file if it exists or not.  If int
            is passed, cached file will be refreshed if the number of days
            since last modification exceeds that value.

        params : dict
            Dictionary used as header during the API request.

        url : str
            The URL to be accessed.

        download : bool (optional, default=None)
            Whether to download the query or not.  Has no effect for
            retrieval requests.

        max_entries : int (optional, default=None)
            Raise error when the number of results is beyond this number.
            Has no effect for retrieval requests.

        verbose : bool (optional, default=False)
            Whether to print a progress bar for multi-page requests.

        *args, **kwds
            Arguments and key-value pairings to be passed on
            to `get_content()`.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds `max_entries`.

        ValueError
            If `refresh` is neither boolean nor numeric.
        """
        # Compare age of file to test whether we refresh
        refresh, exists, mod_ts = _check_file_age(fname, refresh)

        # Read or download, possibly with caching
        search_request = "query" in params
        if exists and not refresh:
            self._mdate = mod_ts
            if search_request:
                with open(fname, 'rb') as f:
                    self._json = [loads(line) for line in f.readlines()]
                self._n = len(self._json)
            else:
                with open(fname, 'rb') as f:
                    self._json = loads(f.read().decode('utf-8'))
        else:
            resp = get_content(url, params, *args, **kwds)
            header = resp.headers
            if search_request:
                # Download results page-wise
                res = resp.json()
                n = int(res['search-results'].get('opensearch:totalResults',
                                                  0))
                self._n = n
                self._json = []
                data = "".encode('utf-8')
                cursor_false = "cursor" in params and not params["cursor"]
                if cursor_false and n > max_entries:
                    # Stop if there are too many results
                    text = (f'Found {n} matches. Set max_entries to a higher '
                            f'number, change your query ({params["query"]}) or set '
                            'subscription=True')
                    raise ScopusQueryError(text)
                if n and download:
                    data, header = _parse(res, n, url, params, verbose, *args,
                                          **kwds)
                    self._json = data
            else:
                data = resp.text.encode('utf-8')
                self._json = loads(data)
            # Set private variables
            self._mdate = time()
            self._header = header
            # Finally write data
            _write_json(fname, data)
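
The constructor delegates the age check to _check_file_age(), which is not shown here. A plausible sketch of such a helper, treating an integer `refresh` as a maximum file age in days as described in the docstring (hypothetical implementation):

from os.path import exists, getmtime
from time import time

def _check_file_age(fname, refresh):
    """Return (refresh, exists, mod_ts) following the documented semantics."""
    file_exists = exists(fname)
    mod_ts = getmtime(fname) if file_exists else None
    if file_exists and not isinstance(refresh, bool):
        # Numeric refresh: only refresh when the file is older than `refresh` days
        age_days = (time() - mod_ts) / 86400
        refresh = age_days > refresh
    return bool(refresh), file_exists, mod_ts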
Example #5
    def __init__(self,
                 identifier,
                 api,
                 refresh,
                 view,
                 id_type=None,
                 date=None):
        """Class intended as superclass to perform retrievals.

        Parameters
        ----------
        identifier : str or int
            The ID of the item to retrieve.

        api : str
            The name of the Scopus API to be accessed.  Allowed values:
            AbstractRetrieval, AuthorRetrieval, CitationOverview,
            ContentAffiliationRetrieval.

        refresh : bool
            Whether to refresh the cached file if it exists or not.

        view : str
            The view of the file that should be downloaded.

        id_type : str (optional, default=None)
            The type of used ID.
            Note: Will only take effect for the AbstractRetrieval API.

        date : str (optional, default=None)
            Two years separated by a hyphen, defining the period for which
            citations should be looked up.
            Note: Will only take effect for the CitationOverview API.

        Raises
        ------
        ValueError
            If the api parameter or view parameter is an invalid entry.
        """
        # Checks
        if api not in RETRIEVAL_URL:
            raise ValueError('api parameter must be one of ' +
                             ', '.join(RETRIEVAL_URL.keys()))

        # Construct parameters
        url = RETRIEVAL_URL[api]
        if api == "AbstractRetrieval":
            url += id_type + "/"
        params = {'view': view}
        if api == 'CitationOverview':
            params.update({
                'date': date,
                'scopus_id': identifier.split('0-')[-1]
            })
        url += identifier

        # Parse file contents
        qfile = join(get_folder(api, view), identifier.replace('/', '_'))
        res = get_content(qfile, refresh, url=url, params=params)
        self._json = loads(res.decode('utf-8'))
        self._view = view
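
The one non-obvious step above is deriving the bare Scopus ID for the CitationOverview API by splitting the identifier on '0-', which strips the "2-s2.0-" EID prefix. A quick illustration with a made-up EID:

eid = "2-s2.0-85012345678"       # illustrative EID
print(eid.split('0-')[-1])       # '85012345678' -- the bare Scopus ID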
Example #6
    def __init__(self,
                 params: Dict,
                 url: str,
                 api: str,
                 download: bool = True,
                 verbose: bool = False,
                 *args: str,
                 **kwds: str) -> None:
        """Class intended as base class for superclasses.

        :param params: Dictionary used as header during the API request.
        :param url: The URL to be accessed.
        :param api: The Scopus API to be accessed.
        :param download: Whether to download the query or not.  Has no effect
                         for retrieval requests.
        :param verbose: Whether to print a download progress bar.
        :param args: Positional arguments passed on to `get_content()`.
        :param kwds: Keyword arguments passed on to `get_content()`.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds `SEARCH_MAX_ENTRIES`
            and no cursor is used.

        ValueError
            If `self._refresh` is neither boolean nor numeric.
        """
        # Checks
        try:
            _ = int(self._refresh)
        except ValueError:
            msg = "Parameter refresh needs to be numeric or boolean."
            raise ValueError(msg)

        # Compare age of file to test whether we refresh
        self._refresh, mod_ts = _check_file_age(self)

        # Read or download, possibly with caching
        fname = self._cache_file_path
        search_request = "query" in params
        if fname.exists() and not self._refresh:
            self._mdate = mod_ts
            if search_request:
                self._json = [
                    loads(line) for line in fname.read_text().split("\n")
                    if line
                ]
                self._n = len(self._json)
            else:
                self._json = loads(fname.read_text())
        else:
            resp = get_content(url, api, params, *args, **kwds)
            header = resp.headers
            if search_request:
                # Get number of results
                res = resp.json()
                n = int(res['search-results'].get('opensearch:totalResults',
                                                  0))
                self._n = n
                self._json = []
                # Results size check
                if params.get("cursor") is None and n > SEARCH_MAX_ENTRIES:
                    # Stop if there are too many results
                    text = f'Found {n} matches.  The query fails to return '\
                           f'more than {SEARCH_MAX_ENTRIES} entries.  Change '\
                           'your query such that it returns fewer entries.'
                    raise ScopusQueryError(text)
                # Download results page-wise
                if download:
                    data = ""
                    if n:
                        data, header = _parse(res, n, url, api, params,
                                              verbose, *args, **kwds)
                        self._json = data
                else:
                    data = None
            else:
                data = loads(resp.text)
                self._json = data
                data = [data]
            # Set private variables
            self._mdate = time()
            self._header = header
            # Finally write data unless download=False
            if download:
                text = [dumps(item, separators=(',', ':')) for item in data]
                fname.write_text("\n".join(text))