def query_journal(source_id, years, refresh):
    """Get authors by year for a particular source.

    Parameters
    ----------
    source_id : str or int
        The Scopus ID of the source.

    years : container of int or container of str
        The relevant publication years to search for.  Only used when the
        complete publication list cannot be retrieved in one query.

    refresh : bool
        Whether to refresh cached files if they exist, or not.

    Returns
    -------
    d : defaultdict of list
        Dictionary keyed by year (the first four characters of each
        publication's `coverDate`) listing all authors who published in
        that year.  Publications without a `coverDate` are skipped.
    """
    try:
        # Try complete publication list first
        q = "SOURCE-ID({})".format(source_id)
        if base_query("docs", q, size_only=True) > 5000:
            raise ScopusQueryError()
        res = base_query("docs", q, refresh=refresh)
    except (ScopusQueryError, Scopus500Error):
        # Fall back to year-wise queries
        res = []
        # The template only depends on the source, so build it once
        template = Template(
            "SOURCE-ID({}) AND PUBYEAR IS $fill".format(source_id))
        for year in years:
            params = {
                "group": [year],
                "res": [],
                "template": template,
                "joiner": "",
                "q_type": "docs",
                "refresh": refresh,
            }
            ext, _ = stacked_query(**params)
            if not valid_results(ext):
                # Reload queries with missing years.  stacked_query()
                # extends the list passed as "res" in place, so hand it a
                # fresh container; otherwise the invalid results from the
                # first attempt would be kept and valid ones duplicated.
                params.update({"refresh": True, "res": []})
                ext, _ = stacked_query(**params)
            res.extend(ext)

    # Sort authors by year
    d = defaultdict(list)
    for pub in res:
        try:
            year = pub.coverDate[:4]
        except TypeError:
            # Missing year (coverDate is not subscriptable)
            continue
        d[year].extend(get_authors([pub]))
    return d
def __init__(self, fname, refresh, params, url, download=None,
             max_entries=None, verbose=False, *args, **kwds):
    """Class intended as base class for superclasses.

    Reads the cached file if it exists and must not be refreshed;
    otherwise requests the content from `url`, stores it on the instance
    and writes it to the cache file.

    Parameters
    ----------
    fname : str
        The filename (including path) of the cache object.

    refresh : bool or int
        Whether to refresh the cached file if it exists or not.  If int
        is passed, cached file will be refreshed if the number of days
        since last modification exceeds that value.

    params : dict
        Dictionary used as header during the API request.  The request
        is treated as a search request if it contains the key "query".

    url : str
        The URL to be accessed.

    download : bool (optional, default=None)
        Whether to download the query or not.  Has no effect for
        retrieval requests.

    max_entries : int (optional, default=None)
        Raise error when the number of results is beyond this number.
        `None` disables the check.  Has no effect for retrieval requests.

    verbose : bool (optional, default=False)
        Whether to print a progress bar for multi-page requests.

    *args, **kwds
        Arguments and key-value pairings to be passed on to
        `get_content()`.

    Raises
    ------
    ScopusQueryError
        If a search request without cursor yields more than
        `max_entries` results.

    ValueError
        If `refresh` is neither boolean nor numeric (presumably raised
        by `_check_file_age()`; not visible in this block).
    """
    # Compare age of file to test whether we refresh
    refresh, exists, mod_ts = _check_file_age(fname, refresh)

    # Read or download, eventually with caching
    search_request = "query" in params
    if exists and not refresh:
        self._mdate = mod_ts
        if search_request:
            # Search results are cached with one JSON document per line
            with open(fname, 'rb') as f:
                self._json = [loads(line) for line in f.readlines()]
            self._n = len(self._json)
        else:
            with open(fname, 'rb') as f:
                self._json = loads(f.read().decode('utf-8'))
    else:
        resp = get_content(url, params, *args, **kwds)
        header = resp.headers
        if search_request:
            # Download results page-wise
            res = resp.json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            self._n = n
            self._json = []
            # Stays an empty byte string when nothing is downloaded
            data = "".encode('utf-8')
            cursor_false = "cursor" in params and not params["cursor"]
            # max_entries=None means "no limit"; comparing an int with
            # None would raise TypeError
            too_many = max_entries is not None and n > max_entries
            if cursor_false and too_many:
                # Stop if there are too many results
                query = params["query"]
                text = (f'Found {n} matches. Set max_entries to a higher '
                        f'number, change your query ({query}) or set '
                        'subscription=True')
                raise ScopusQueryError(text)
            if n and download:
                data, header = _parse(res, n, url, params, verbose,
                                      *args, **kwds)
                self._json = data
        else:
            data = resp.text.encode('utf-8')
            self._json = loads(data)
        # Set private variables
        self._mdate = time()
        self._header = header
        # Finally write data
        _write_json(fname, data)
def __init__(self, query, api, refresh, view='STANDARD', count=200,
             max_entries=5000, cursor=False, download_results=True,
             **kwds):
    """Class intended as superclass to perform a search query.

    Parameters
    ----------
    query : str
        A string of the query.

    api : str
        The name of the Scopus API to be accessed.  Allowed values:
        AffiliationSearch, AuthorSearch, ScopusSearch.

    refresh : bool
        Whether to refresh the cached file if it exists or not.

    view : str
        The view of the file that should be downloaded.

    count : int (optional, default=200)
        The number of entries to be displayed at once.  A smaller number
        means more queries with each query having less results.

    max_entries : int (optional, default=5000)
        Raise error when the number of results is beyond this number.
        To skip this check, set `max_entries` to `None`.

    cursor : bool (optional, default=False)
        Whether to use the cursor in order to iterate over all search
        results without limit on the number of the results.  In contrast
        to `start` parameter, the `cursor` parameter does not allow users
        to obtain partial results.

    download_results : bool (optional, default=True)
        Whether to download results (if they have not been cached) or
        not.

    kwds : key-value pairings, optional
        Keywords passed on to requests header.  Must contain fields
        and values specified in the respective API specification.

    Raises
    ------
    ScopusQueryError
        If the number of search results exceeds max_entries.

    ValueError
        If the api parameter is an invalid entry.
    """
    # Read the file contents if file exists and we are not refreshing,
    # otherwise download query anew and cache file
    fname = md5(query.encode('utf8')).hexdigest()
    qfile = join(get_folder(api, view), fname)
    if not refresh and exists(qfile):
        # The cache holds one JSON document per line
        with open(qfile, "rb") as f:
            self._json = [loads(line) for line in f.readlines()]
        self._n = len(self._json)
    else:
        # Set query parameters
        params = {'query': query, 'count': count, 'view': view}
        if cursor:
            params.update({'cursor': '*'})
        else:
            params.update({'start': 0})
        # Download results
        res = download(url=SEARCH_URL[api], params=params, **kwds).json()
        n = int(res['search-results'].get('opensearch:totalResults', 0))
        self._n = n
        # max_entries=None skips the size check, as documented; comparing
        # an int with None would raise TypeError
        if not cursor and max_entries is not None and n > max_entries:
            # Stop if there are too many results
            text = ('Found {} matches. Set max_entries to a higher '
                    'number, change your query ({}) or set '
                    'subscription=True'.format(n, query))
            raise ScopusQueryError(text)
        if download_results:
            self._json = _parse(res, params, n, api, **kwds)
            # Finally write out the file
            with open(qfile, 'wb') as f:
                for item in self._json:
                    f.write('{}\n'.format(dumps(item)).encode('utf-8'))
        else:
            # Assures that properties will not result in an error
            self._json = []
    self._view = view
def __init__(self,
             params: Dict,
             url: str,
             api: str,
             download: bool = True,
             verbose: bool = False,
             *args: str, **kwds: str
             ) -> None:
    """Class intended as base class for superclasses.

    Reads the cache file (`self._cache_file_path`) if it exists and must
    not be refreshed; otherwise requests the content and, unless
    `download` is False, writes it to the cache file.

    :param params: Dictionary used as header during the API request.
                   The request is treated as a search request if it
                   contains the key "query".
    :param url: The URL to be accessed.
    :param api: The Scopus API to be accessed.
    :param download: Whether to download the query or not.  Has no effect
                     for retrieval requests.
    :param verbose: Whether to print a download progress bar.
    :param args: Keywords passed on `get_content()`
    :param kwds: Keywords passed on `get_content()`

    Raises
    ------
    ValueError
        If `self._refresh` is neither boolean nor numeric.

    ScopusQueryError
        If a search request without cursor yields more than
        `SEARCH_MAX_ENTRIES` results.
    """
    # Checks
    try:
        _ = int(self._refresh)
    except (TypeError, ValueError) as exc:
        # int() raises TypeError for e.g. None or a list, and ValueError
        # for non-numeric strings; report both uniformly as documented
        msg = "Parameter refresh needs to be numeric or boolean."
        raise ValueError(msg) from exc

    # Compare age of file to test whether we refresh
    self._refresh, mod_ts = _check_file_age(self)

    # Read or download, possibly with caching
    fname = self._cache_file_path
    search_request = "query" in params
    if fname.exists() and not self._refresh:
        self._mdate = mod_ts
        if search_request:
            # Search results are cached with one JSON document per line
            self._json = [loads(line) for line in
                          fname.read_text().split("\n") if line]
            self._n = len(self._json)
        else:
            self._json = loads(fname.read_text())
    else:
        resp = get_content(url, api, params, *args, **kwds)
        header = resp.headers
        if search_request:
            # Get number of results
            res = resp.json()
            n = int(res['search-results'].get('opensearch:totalResults', 0))
            self._n = n
            self._json = []
            # Results size check
            if params.get("cursor") is None and n > SEARCH_MAX_ENTRIES:
                # Stop if there are too many results
                text = (f'Found {n} matches. The query fails to return '
                        f'more than {SEARCH_MAX_ENTRIES} entries. Change '
                        'your query such that it returns fewer entries.')
                raise ScopusQueryError(text)
            # Download results page-wise
            if download:
                data = ""
                if n:
                    data, header = _parse(res, n, url, api, params,
                                          verbose, *args, **kwds)
                    self._json = data
            else:
                data = None
        else:
            data = loads(resp.text)
            self._json = data
            # Wrap so that the writer below emits a single line
            data = [data]
        # Set private variables
        self._mdate = time()
        self._header = header
        # Finally write data unless download=False
        if download:
            text = [dumps(item, separators=(',', ':')) for item in data]
            fname.write_text("\n".join(text))
def stacked_query(group, res, template, joiner, q_type, refresh,
                  i=0, total=None):
    """Auxiliary function to recursively perform queries until they work.

    Parameters
    ----------
    group : list of str
        Scopus IDs (of authors or sources) for which the stacked query
        should be conducted.

    res : list
        (Initially empty) Container to which the query results will be
        appended.

    template : Template()
        A string template with one parameter named `fill` which will
        be used as search query.

    joiner : str
        On which the group elements should be joined to fill the query.

    q_type : str
        Determines the query search that will be used.  Allowed values:
        "author", "docs".

    refresh : bool
        Whether the cached files should be refreshed or not.

    i : int (optional, default=0)
        A count variable to be used for printing the progress bar.

    total : int (optional, default=None)
        The total number of elements in the group.  If provided, a
        progress bar will be printed.

    Returns
    -------
    res : list
        A list of namedtuples representing publications.

    i : int
        A running variable to indicate the progress.

    Raises
    ------
    Scopus400Error, Scopus500Error, ScopusQueryError
        If the query fails for a group that cannot be split any further
        (fewer than two elements).

    Notes
    -----
    Results of each successful query are appended to `res`.
    """
    group = [str(g) for g in group]  # make robust to passing int
    q = template.substitute(fill=joiner.join(group))
    try:
        n = base_query(q_type, q, size_only=True)
        if n > 5000 and len(group) > 1:
            raise ScopusQueryError()
        res.extend(base_query(q_type, q, refresh=refresh))
        verbose = total is not None
        i += len(group)
        print_progress(i, total, verbose)
    except (Scopus400Error, Scopus500Error, ScopusQueryError):
        if len(group) < 2:
            # Splitting would yield an empty group plus the very same
            # group again, i.e. the recursion would never terminate
            raise
        # Split query group into two equally sized groups
        mid = len(group) // 2
        params = {
            "group": group[:mid],
            "res": res,
            "template": template,
            "i": i,
            "joiner": joiner,
            "q_type": q_type,
            "total": total,
            "refresh": refresh,
        }
        res, i = stacked_query(**params)
        params.update({"group": group[mid:], "i": i})
        res, i = stacked_query(**params)
    return res, i