def create_http_session(hostname):
    """
    create_http_session: Builds a requests session with caching adapters mounted
    Args:
        hostname: (str) host name; 'www.' is prepended to it to form the
            prefixes that get the CacheForeverHeuristic adapter
    Returns: requests.Session with the adapters mounted
    """
    session = requests.Session()

    # Both adapters share the same FileCache instance.
    file_cache = FileCache('.webcache')
    default_adapter = CacheControlAdapter(cache=file_cache)
    site_adapter = CacheControlAdapter(
        heuristic=CacheForeverHeuristic(),
        cache=file_cache,
    )

    schemes = ('http://', 'https://')

    # Generic prefixes first: every http(s) URL goes through the default adapter.
    for scheme in schemes:
        session.mount(scheme, default_adapter)

    # More specific prefixes for www.<hostname> use the cache-forever adapter.
    for scheme in schemes:
        session.mount(scheme + 'www.' + hostname, site_adapter)

    return session
def __init__(self, main_source_domain=None, start_page=None):
    """
    Configure the source domain, start page, and cached session adapters.

    Args:
        main_source_domain: (str) base URL of the source site, scheme included
            since it is also used as START_PAGE; trailing slashes are
            stripped (optional)
        start_page: (str) URL stored as START_PAGE; when no main source domain
            is set, the domain is derived from this URL (optional)
    Raises:
        ValueError: if both arguments are None, or if no main source domain is
            set and start_page is empty
    """
    if main_source_domain is None and start_page is None:
        raise ValueError(
            'Need to specify main_source_domain or start_page.')

    if main_source_domain:
        self.MAIN_SOURCE_DOMAIN = main_source_domain.rstrip('/')
        self.START_PAGE = self.MAIN_SOURCE_DOMAIN

    if self.MAIN_SOURCE_DOMAIN is None:
        if not start_page:
            # Nothing to derive the domain from. Without this guard an empty
            # prefix would be mounted below, and an empty prefix matches
            # every URL the session requests.
            raise ValueError(
                'Need to specify main_source_domain or start_page.')
        parsed_url = urlparse(start_page)
        if parsed_url.scheme and parsed_url.netloc:
            # Keep the scheme: the value is used as a session mount prefix,
            # which is compared against the start of full URLs, so a bare
            # netloc would never match any request.
            self.MAIN_SOURCE_DOMAIN = (
                parsed_url.scheme + '://' + parsed_url.netloc)
        else:
            # URL without scheme or host: fall back to the bare netloc.
            self.MAIN_SOURCE_DOMAIN = parsed_url.netloc

    # NOTE(review): if SOURCE_DOMAINS is a class-level list, this append
    # mutates state shared by all instances -- confirm against the class body.
    if self.MAIN_SOURCE_DOMAIN not in self.SOURCE_DOMAINS:
        self.SOURCE_DOMAINS.append(self.MAIN_SOURCE_DOMAIN)

    if start_page:
        self.START_PAGE = start_page

    # keep track of broken links
    self.broken_links = []

    forever_adapter = CacheControlAdapter(
        heuristic=CacheForeverHeuristic(), cache=self.CACHE)
    for source_domain in self.SOURCE_DOMAINS:
        # TODO: change to less aggressive in final version
        self.SESSION.mount(source_domain, forever_adapter)
import requests import time from selenium import webdriver from requests_file import FileAdapter from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter, InvalidatingCacheControlAdapter DOWNLOAD_SESSION = requests.Session( ) # Session for downloading content from urls DOWNLOAD_SESSION.mount('https://', requests.adapters.HTTPAdapter(max_retries=3)) DOWNLOAD_SESSION.mount('file://', FileAdapter()) cache = FileCache('.webcache') forever_adapter = CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache) DOWNLOAD_SESSION.mount('http://', forever_adapter) DOWNLOAD_SESSION.mount('https://', forever_adapter) def read(path, loadjs=False, session=None, driver=None): """ read: Reads from source and returns contents Args: path: (str) url or local path to download loadjs: (boolean) indicates whether to load js (optional) session: (requests.Session) session to use to download (optional) driver: (selenium.webdriver) webdriver to use to download (optional) Returns: str content from file or page """ session = session or DOWNLOAD_SESSION try: if loadjs: # Wait until js loads then return contents