def download_epmc(download_path, year=2020):
    """
    Download one year of search results into per-month JSON Lines files.

    For each calendar month a query on ``FIRST_PDATE`` (presumably the
    Europe PMC search API, going by the function name -- confirm against
    ``yield_results``) is issued, and every result is written as one JSON
    document per line to ``<download_path>/<year>/<MM>.jsonl``.

    Months whose output file already exists are skipped. Results are first
    written to ``<MM>.jsonl.tmp`` and renamed only after the month finished,
    so an interrupted run never leaves a partial ``.jsonl`` file behind.

    :param download_path: directory under which the ``<year>`` folder is
        created (if missing)
    :param year: year to download; must be convertible with ``int()``
    """
    # Local import: the month-length lookup is only needed in this function.
    import calendar

    retries = Retry(**RETRY_PARAMETERS)
    # NOTE(review): whether an instance-level BACKOFF_MAX is honoured depends
    # on the installed urllib3 version -- confirm.
    retries.BACKOFF_MAX = RETRY_BACKOFF_MAX

    # The context manager closes the session (and its pooled connections)
    # even when a request or a write fails part-way through.
    with requests.Session() as session:
        session.mount("https://", HTTPAdapter(max_retries=retries))

        year_path = os.path.join(download_path, str(year))
        os.makedirs(year_path, exist_ok=True)

        for month_number in range(1, 13):
            month = f"{month_number:02}"
            month_path = os.path.join(year_path, f"{month}.jsonl")
            if os.path.exists(month_path):
                print(
                    f"Skipping because {month_path} exists. Delete if you want to redownload"
                )
                continue

            # Use the real last day of the month instead of a fixed "31",
            # which produced impossible dates such as 02-31 or 04-31.
            last_day = calendar.monthrange(int(year), month_number)[1]
            params = {
                "query": (
                    f"(FIRST_PDATE:[{year}-{month}-01 "
                    f"TO {year}-{month}-{last_day:02}])"
                ),
                "format": "json",
                "resultType": "core",
                "pageSize": 100,
            }

            tmp_month_path = f"{month_path}.tmp"
            with open(tmp_month_path, "w", encoding="utf-8") as f:
                # The hit count is only used to size the progress bar.
                hit_count = get_hit_count(session, params)
                for result in tqdm(
                    yield_results(session, params),
                    total=hit_count,
                    desc=f"Year {year} Month {month}",
                ):
                    f.write(json.dumps(result))
                    f.write("\n")

            # Publish the finished file only after it was closed successfully.
            os.rename(tmp_month_path, month_path)
def requests_session(self, adapter_kwargs=None):
    """
    Build a ``requests`` session whose HTTPS adapter retries failed requests.

    The retry policy is created from ``RETRY_PARAMETERS`` and capped with
    ``RETRY_BACKOFF_MAX``. Any ``adapter_kwargs`` are forwarded unchanged to
    ``HTTPAdapter``; connection-pool sizing is therefore whatever the caller
    passes there (nothing is set here by default). Only the ``https://``
    prefix is mounted.

    :param adapter_kwargs: optional dict of extra keyword arguments for
        ``HTTPAdapter``; ``None`` means no extra arguments
    :return: the configured session
    """
    extra_adapter_args = {} if adapter_kwargs is None else adapter_kwargs

    retry_policy = Retry(**RETRY_PARAMETERS)
    retry_policy.BACKOFF_MAX = RETRY_BACKOFF_MAX

    https_adapter = HTTPAdapter(max_retries=retry_policy, **extra_adapter_args)

    new_session = requests.Session()
    new_session.mount('https://', https_adapter)
    return new_session
def get_requests_session(auth=False):
    """
    Return a new requests session that retries failed requests.

    Up to three retries (total, read and connect) are allowed with a backoff
    factor of 0.5, and responses with status 500, 502, 503 or 504 are retried.
    The same adapter is mounted for both ``http://`` and ``https://``.

    :param auth: currently unused by this function
    :return: the configured requests session
    :rtype: requests.Session
    """
    retry_options = {
        "total": 3,
        "read": 3,
        "connect": 3,
        "backoff_factor": 0.5,
        "status_forcelist": (500, 502, 503, 504),
    }
    retry_policy = Retry(**retry_options)
    retry_policy.BACKOFF_MAX = 2

    shared_adapter = requests.adapters.HTTPAdapter(max_retries=retry_policy)

    configured = requests.Session()
    for scheme in ("http://", "https://"):
        configured.mount(scheme, shared_adapter)
    return configured