Example #1
File: queries.py Project: fagan2888/sosia
def query_journal(source_id, years, refresh):
    """Get authors by year for a particular source.

    Parameters
    ----------
    source_id : str or int
        The Scopus ID of the source.

    years : container of int or container of str
        The relevant publication years to search for.

    refresh : bool (optional)
        Whether to refresh cached files if they exist.

    Returns
    -------
    d : dict
        Dictionary keyed by year listing all authors who published in
        that year.
    """
    try:  # Try complete publication list first
        q = "SOURCE-ID({})".format(source_id)
        if base_query("docs", q, size_only=True) > 5000:
            raise ScopusQueryError()
        res = base_query("docs", q, refresh=refresh)
    except (ScopusQueryError,
            Scopus500Error):  # Fall back to year-wise queries
        res = []
        for year in years:
            q = Template(
                "SOURCE-ID({}) AND PUBYEAR IS $fill".format(source_id))
            params = {
                "group": [year],
                "res": [],
                "template": q,
                "joiner": "",
                "q_type": "docs",
                "refresh": refresh
            }
            ext, _ = stacked_query(**params)
            if not valid_results(ext):  # Reload queries with missing years
                params.update({"refresh": True})
                ext, _ = stacked_query(**params)
            res.extend(ext)
    # Sort authors by year
    d = defaultdict(list)
    for pub in res:
        try:
            year = pub.coverDate[:4]
        except TypeError:  # missing year
            continue
        d[year].extend(get_authors([pub]))  # Populate dict
    return d
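
A minimal usage sketch for the function above; the source ID and years are illustrative, and a configured Scopus API key plus the sosia internals are assumed:

# Hypothetical call: collect authors per year for an illustrative source
authors_by_year = query_journal("22900", range(2017, 2020), refresh=False)
for year, authors in sorted(authors_by_year.items()):
    print(year, len(set(authors)))  # unique authors per publication year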
Example #2
    def __init__(self,
                 fname,
                 refresh,
                 params,
                 url,
                 download=None,
                 max_entries=None,
                 verbose=False,
                 *args,
                 **kwds):
        """Class intended as base class for superclasses.

        Parameters
        ----------
        fname : str
            The filename (including path) of the cache object.

        refresh : bool or int
            Whether to refresh the cached file if it exists.  If an int
            is passed, the cached file will be refreshed if the number of
            days since its last modification exceeds that value.

        params : dict
            Dictionary used as header during the API request.

        url : str
            The URL to be accessed.

        download : bool (optional, default=None)
            Whether to download the query or not.  Has no effect for
            retrieval requests.

        max_entries : int (optional, default=None)
            Raise error when the number of results is beyond this number.
            Has no effect for retrieval requests.

        verbose : bool (optional, default=False)
            Whether to print a progress bar for multi-page requests.

        *args, **kwds
            Arguments and key-value pairings to be passed on
            to `get_content()`.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds `max_entries`.

        ValueError
            If `refresh` is neither boolean nor numeric.
        """
        # Compare age of file to test whether we refresh
        refresh, exists, mod_ts = _check_file_age(fname, refresh)

        # Read or download, possibly with caching
        search_request = "query" in params
        if exists and not refresh:
            self._mdate = mod_ts
            if search_request:
                with open(fname, 'rb') as f:
                    self._json = [loads(line) for line in f.readlines()]
                self._n = len(self._json)
            else:
                with open(fname, 'rb') as f:
                    self._json = loads(f.read().decode('utf-8'))
        else:
            resp = get_content(url, params, *args, **kwds)
            header = resp.headers
            if search_request:
                # Download results page-wise
                res = resp.json()
                n = int(res['search-results'].get('opensearch:totalResults',
                                                  0))
                self._n = n
                self._json = []
                data = "".encode('utf-8')
                cursor_false = "cursor" in params and not params["cursor"]
                if cursor_false and n > max_entries:
                    # Stop if there are too many results
                    text = (f'Found {n} matches. Set max_entries to a higher '
                            f'number, change your query '
                            f'({params["query"]}) or set subscription=True')
                    raise ScopusQueryError(text)
                if n and download:
                    data, header = _parse(res, n, url, params, verbose, *args,
                                          **kwds)
                    self._json = data
            else:
                data = resp.text.encode('utf-8')
                self._json = loads(data)
            # Set private variables
            self._mdate = time()
            self._header = header
            # Finally write data
            _write_json(fname, data)
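
The snippet depends on a project-internal `_check_file_age()` helper. A minimal sketch of the contract it would need to satisfy, assuming (as the `refresh` docstring above states) that an int means a maximum file age in days:

from os.path import exists, getmtime
from time import time

def _check_file_age_sketch(fname, refresh):
    """Hypothetical stand-in returning (refresh, exists, mod_ts)."""
    file_exists = exists(fname)
    mod_ts = getmtime(fname) if file_exists else None
    if file_exists and not isinstance(refresh, bool):
        # A numeric refresh is a maximum age in days since last modification
        refresh = (time() - mod_ts) / 86400 > refresh
    return bool(refresh), file_exists, mod_ts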
Example #3
    def __init__(self, query, api, refresh, view='STANDARD', count=200,
                 max_entries=5000, cursor=False, download_results=True, **kwds):
        """Class intended as superclass to perform a search query.

        Parameters
        ----------
        query : str
            A string of the query.

        api : str
            The name of the Scopus API to be accessed.  Allowed values:
            AffiliationSearch, AuthorSearch, ScopusSearch.

        refresh : bool
            Whether to refresh the cached file if it exists.

        view : str
            The view of the file that should be downloaded.

        count : int (optional, default=200)
            The number of entries to be displayed at once.  A smaller number
            means more queries, each returning fewer results.

        max_entries : int (optional, default=5000)
            Raise error when the number of results is beyond this number.
            To skip this check, set `max_entries` to `None`.

        cursor : bool (optional, default=False)
            Whether to use the cursor in order to iterate over all search
            results without limit on the number of the results.  In contrast
            to `start` parameter, the `cursor` parameter does not allow users
            to obtain partial results.

        download_results : bool (optional, default=True)
            Whether to download results (if they have not been cached) or not.

        kwds : key-value pairings, optional
            Keywords passed on to requests header.  Must contain fields
            and values specified in the respective API specification.

        Raises
        ------
        ScopusQueryError
            If the number of search results exceeds max_entries.

        ValueError
            If the `api` parameter is an invalid entry.
        """
        # Read the file contents if file exists and we are not refreshing,
        # otherwise download query anew and cache file
        fname = md5(query.encode('utf8')).hexdigest()
        qfile = join(get_folder(api, view), fname)
        if not refresh and exists(qfile):
            with open(qfile, "rb") as f:
                self._json = [loads(line) for line in f.readlines()]
            self._n = len(self._json)
        else:
            # Set query parameters
            params = {'query': query, 'count': count, 'view': view}
            if cursor:
                params.update({'cursor': '*'})
            else:
                params.update({'start': 0})
            # Download results
            res = download(url=SEARCH_URL[api], params=params, **kwds).json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            self._n = n
            if not cursor and n > max_entries:  # Stop if there are too many results
                text = ('Found {} matches. Set max_entries to a higher '
                        'number, change your query ({}) or set '
                        'subscription=True'.format(n, query))
                raise ScopusQueryError(text)
            if download_results:
                self._json = _parse(res, params, n, api, **kwds)
                # Finally write out the file
                with open(qfile, 'wb') as f:
                    for item in self._json:
                        f.write('{}\n'.format(dumps(item)).encode('utf-8'))
            else:
                # Ensures that properties will not raise an error
                self._json = []
        self._view = view
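
The cache key is the MD5 digest of the raw query string, so identical queries always resolve to the same file. An illustrative reconstruction of the path logic; `get_folder()` is project-internal, so a made-up base directory stands in:

from hashlib import md5
from os.path import join

query = "AU-ID(12345)"  # illustrative query
fname = md5(query.encode('utf8')).hexdigest()
qfile = join("scopus_cache", "AuthorSearch", "STANDARD", fname)
print(qfile)  # deterministic: the same query always maps to this file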
Example #4
    def __init__(self,
                 params: Dict,
                 url: str,
                 api: str,
                 download: bool = True,
                 verbose: bool = False,
                 *args: str,
                 **kwds: str) -> None:
        """Class intended as base class for superclasses.

        :param params: Dictionary used as header during the API request.
        :param url: The URL to be accessed.
        :param api: The Scopus API to be accessed.
        :param download: Whether to download the query or not.  Has no effect
                         for retrieval requests.
        :param verbose: Whether to print a download progress bar.
        :param args: Positional arguments passed on to `get_content()`.
        :param kwds: Keywords passed on to `get_content()`.

        Raises
        ------
        ValueError
            If `self._refresh` is neither boolean nor numeric.
        """
        # Checks
        try:
            _ = int(self._refresh)
        except ValueError:
            msg = "Parameter refresh needs to be numeric or boolean."
            raise ValueError(msg)

        # Compare age of file to test whether we refresh
        self._refresh, mod_ts = _check_file_age(self)

        # Read or download, possibly with caching
        fname = self._cache_file_path
        search_request = "query" in params
        if fname.exists() and not self._refresh:
            self._mdate = mod_ts
            if search_request:
                self._json = [
                    loads(line) for line in fname.read_text().split("\n")
                    if line
                ]
                self._n = len(self._json)
            else:
                self._json = loads(fname.read_text())
        else:
            resp = get_content(url, api, params, *args, **kwds)
            header = resp.headers
            if search_request:
                # Get number of results
                res = resp.json()
                n = int(res['search-results'].get('opensearch:totalResults',
                                                  0))
                self._n = n
                self._json = []
                # Results size check
                if params.get("cursor") is None and n > SEARCH_MAX_ENTRIES:
                    # Stop if there are too many results
                    text = f'Found {n} matches.  The query fails to return '\
                           f'more than {SEARCH_MAX_ENTRIES} entries.  Change '\
                           'your query such that it returns fewer entries.'
                    raise ScopusQueryError(text)
                # Download results page-wise
                if download:
                    data = ""
                    if n:
                        data, header = _parse(res, n, url, api, params,
                                              verbose, *args, **kwds)
                        self._json = data
                else:
                    data = None
            else:
                data = loads(resp.text)
                self._json = data
                data = [data]
            # Set private variables
            self._mdate = time()
            self._header = header
            # Finally write data unless download=False
            if download:
                text = [dumps(item, separators=(',', ':')) for item in data]
                fname.write_text("\n".join(text))
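
Search results are cached as JSON lines: one compact object per line, written with `separators=(',', ':')` and read back by splitting on newlines. A self-contained round-trip of that format (file name and records are made up):

from json import dumps, loads
from pathlib import Path

fname = Path("cache_example.jsonl")
records = [{"eid": "2-s2.0-1"}, {"eid": "2-s2.0-2"}]
fname.write_text("\n".join(dumps(r, separators=(',', ':')) for r in records))
# Skip the possible empty trailing line, exactly as the snippet does
loaded = [loads(line) for line in fname.read_text().split("\n") if line]
assert loaded == records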
Example #5
File: queries.py Project: fagan2888/sosia
def stacked_query(group,
                  res,
                  template,
                  joiner,
                  q_type,
                  refresh,
                  i=0,
                  total=None):
    """Auxiliary function to recursively perform queries until they work.

    Parameters
    ----------
    group : list of str
        Scopus IDs (of authors or sources) for which the stacked query should
        be conducted.

    res : list
        (Initially empty) container to which the query results will be
        appended.

    template : Template()
        A string template with one parameter named `fill` which will be used
        as search query.

    joiner : str
        The string with which the group elements are joined to fill the query.

    q_type : str
        Determines the query search that will be used.  Allowed values:
        "author", "docs".

    refresh : bool
        Whether the cached files should be refreshed.

    i : int (optional, default=0)
        A count variable to be used for printing the progress bar.

    total : int (optional, default=None)
        The total number of elements in the group.  If provided, a progress
        bar will be printed.

    Returns
    -------
    res : list
        A list of namedtuples representing publications.

    i : int
        A running variable to indicate the progress.

    Notes
    -----
    Results of each successful query are appended to `res`.
    """
    group = [str(g) for g in group]  # make robust to passing int
    q = template.substitute(fill=joiner.join(group))
    try:
        n = base_query(q_type, q, size_only=True)
        if n > 5000 and len(group) > 1:
            raise ScopusQueryError()
        res.extend(base_query(q_type, q, refresh=refresh))
        verbose = total is not None
        i += len(group)
        print_progress(i, total, verbose)
    except (Scopus400Error, Scopus500Error, ScopusQueryError):
        # Split query group into two equally sized groups
        mid = len(group) // 2
        params = {
            "group": group[:mid],
            "res": res,
            "template": template,
            "i": i,
            "joiner": joiner,
            "q_type": q_type,
            "total": total,
            "refresh": refresh
        }
        res, i = stacked_query(**params)
        params.update({"group": group[mid:], "i": i})
        res, i = stacked_query(**params)
    return res, i
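
The core idea of the recursion, reduced to a runnable toy in which fetch() is a made-up stand-in for base_query() that rejects any group deemed too large:

def fetch(ids):
    # Stand-in: pretend groups of more than two IDs exceed the result cap
    if len(ids) > 2:
        raise ValueError("too many results")
    return ["pub-of-" + i for i in ids]

def stacked(ids, res):
    try:
        res.extend(fetch(ids))
    except ValueError:
        mid = len(ids) // 2  # halve the group and retry each half
        stacked(ids[:mid], res)
        stacked(ids[mid:], res)
    return res

print(stacked(list("abcdefg"), []))  # all seven IDs resolved via halving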