def session(self):
    if self._session is None:
        self._session = real_requests.Session()
        if CacheControlAdapter:
            adapter = CacheControlAdapter(cache=FileCache(".webcache"))
            self._session.mount("http://", adapter)
            self._session.mount("https://", adapter)
            print("Caching to .webcache")
    return self._session
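# A minimal, self-contained sketch of the same lazy on-disk caching pattern,
# written as a module-level helper instead of a property; the module layout and
# the ".webcache" path are illustrative assumptions, not taken from the
# snippet's project.
import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.caches import FileCache

_session = None

def get_cached_session():
    """Build the caching session once, then reuse it for every later call."""
    global _session
    if _session is None:
        _session = requests.Session()
        adapter = CacheControlAdapter(cache=FileCache(".webcache"))
        # Route both plain and TLS traffic through the caching adapter.
        _session.mount("http://", adapter)
        _session.mount("https://", adapter)
    return _session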
def get_reader(self):
    sess = CacheControl(requests.Session(),
                        cache=FileCache(tempfile.gettempdir()))
    req = sess.get(self.file)

    # if the response is not 200, an exception will be raised
    req.raise_for_status()

    return io.BufferedReader(io.BytesIO(req.content))
def get_cached_session(caching=True):
    if not caching:
        return requests.Session()

    # For some reason, in concurrent environments CacheControl works quite badly,
    # so return an uncached session here and leave the cached variant unreachable.
    return requests.Session()

    CACHE_DIR = 'web_cache'
    return CacheControl(requests.Session(),
                        cache=FileCache(CACHE_DIR),
                        heuristic=_LastModifiedNoDate(require_date=False))
def __init__(self, headers=None, cookies=None, cache_name=None, delay=1,
             expire_hours=12, as_string=False):
    '''
    Base class for common scraping tasks

    Args:
        headers: dict of headers
        cookies: cookiejar object
        cache_name: should be full path
        delay: int (be polite!!!)
        expire_hours: int - default 12
        as_string: get string rather than parsed json
    '''
    logging.getLogger(__name__).addHandler(logging.NullHandler())

    if not cookies:
        try:
            import cookielib
            cookies = cookielib.MozillaCookieJar()
        except (NameError, ImportError):
            try:
                import http.cookiejar
                cookies = http.cookiejar.MozillaCookieJar()
            except Exception:
                pass

    _s = requests.Session()
    _s.cookies = cookies

    if headers:
        _s.headers.update(headers)
    else:
        _s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'})

    if cache_name:
        if '/' not in cache_name:
            cache_name = os.path.join('/tmp', cache_name)
        try:
            from cachecontrol import CacheControlAdapter
            from cachecontrol.heuristics import ExpiresAfter
            from cachecontrol.caches import FileCache
            _s.mount('http://', CacheControlAdapter(cache=FileCache(cache_name),
                                                    cache_etags=False,
                                                    heuristic=ExpiresAfter(hours=expire_hours)))
        except ImportError:
            try:
                import requests_cache
                requests_cache.install_cache(cache_name)
            except Exception:
                pass

    self.s = _s
    self.urls = []
    self.as_string = as_string

    if delay > 0:
        self.delay = delay
    else:
        self.delay = None
def __init__(self, destination, staging, s3_url, dry_run, cache):
    self.destination = destination
    self.staging = staging
    self.s3_url = s3_url
    self.dry_run = dry_run

    if cache:
        self.info(f"Using cache {cache}")
        self.fetcher = CacheControl(requests.session(), cache=FileCache(cache))
    else:
        self.info("Making uncached requests")
        self.fetcher = requests
def amalgama_lyrics(artist, song):
    url = amalgama.get_url(artist, song)
    try:
        cached_sess = CacheControl(sess, cache=FileCache('.amalgama'))
        response = cached_sess.get(url)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        print(f'{artist}-{song} not found in amalgama {url}')
        return None
    text = amalgama.get_html(response.text)
    return text
def __init__(self, ctx, schemagraph=None, foreign_properties=None,
             idx=None, cache=None, session=None):
    # type: (Loader.ContextType, rdflib.Graph, Set[unicode], Dict[unicode, Union[List, Dict[unicode, Any], unicode]], Dict[unicode, Any], requests.sessions.Session) -> None
    normalize = lambda url: urlparse.urlsplit(url).geturl()
    if idx is not None:
        self.idx = idx
    else:
        self.idx = NormDict(normalize)

    self.ctx = {}  # type: Loader.ContextType
    if schemagraph is not None:
        self.graph = schemagraph
    else:
        self.graph = rdflib.graph.Graph()

    if foreign_properties is not None:
        self.foreign_properties = foreign_properties
    else:
        self.foreign_properties = set()

    if cache is not None:
        self.cache = cache
    else:
        self.cache = {}

    self.session = None  # type: requests.sessions.Session
    if session is not None:
        self.session = session
    else:
        self.session = CacheControl(requests.Session(),
                                    cache=FileCache(os.path.join(
                                        os.environ["HOME"], ".cache", "salad")))

    self.url_fields = None  # type: Set[unicode]
    self.scoped_ref_fields = None  # type: Dict[unicode, int]
    self.vocab_fields = None  # type: Set[unicode]
    self.identifiers = None  # type: Set[unicode]
    self.identity_links = None  # type: Set[unicode]
    self.standalone = None  # type: Set[unicode]
    self.nolinkcheck = None  # type: Set[unicode]
    self.vocab = {}  # type: Dict[unicode, unicode]
    self.rvocab = {}  # type: Dict[unicode, unicode]
    self.idmap = None  # type: Dict[unicode, Any]
    self.mapPredicate = None  # type: Dict[unicode, unicode]
    self.type_dsl_fields = None  # type: Set[unicode]

    self.add_context(ctx)
def __init__(self, destination: Path, staging: Path, s3_url: str,
             dry_run: bool, is_nightly_enabled: bool, cache: Optional[Path]):
    self.destination = destination
    self.staging = staging
    self.s3_url = s3_url
    self.dry_run = dry_run
    self.is_nightly_enabled = is_nightly_enabled

    if cache:
        self.info(f"Using cache {cache}")
        self.fetcher = CacheControl(requests.session(), cache=FileCache(cache))
    else:
        self.info("Making uncached requests")
        self.fetcher = requests
def get_public_key(token):
    """
    Because Google's public keys are only changed infrequently (on the
    order of once per day), we can take advantage of caching to reduce
    latency and the potential for network errors.
    """
    sess = CacheControl(requests.Session(),
                        cache=FileCache('/tmp/firebase-certs-cache'))
    request = sess.get(_CERT_URL)
    ks = request.json()
    keys = []
    for k, v in ks.items():
        keys.append({"alg": "RS256", "kid": k, "pem": v})
    return search_for_key(token, keys)
def fetch(self):
    feed = None
    if InformantConfig().get_argv_clear_cache():
        ui.debug_print('Clearing cache')
        fs.clear_cachefile()
    if InformantConfig().get_argv_use_cache():
        ui.debug_print('Checking cache in {}'.format(
            InformantConfig().get_cachefile()))
        cachefile = InformantConfig().get_cachefile()
        os.umask(0o0002)  # unrestrict umask so we can cache with proper permissions
        try:
            session = CacheControl(requests.Session(),
                                   cache=FileCache(cachefile,
                                                   filemode=0o0664,
                                                   dirmode=0o0775))
            feed = feedparser.parse(session.get(self.url).content)
        except Exception as e:
            ui.err_print('Unable to read cache information: {}'.format(e))
            ui.debug_print('Falling back to fetching feed')
            feed = feedparser.parse(self.url)
    else:
        feed = feedparser.parse(self.url)

    if feed.bozo:
        e = feed.bozo_exception
        if isinstance(e, URLError):
            # most likely this is an internet issue (no connection)
            ui.warn_print('News could not be fetched for {}'.format(
                self.name if self.name is not None else self.url))
            ui.debug_print('URLError: {}'.format(e.reason))
        else:
            # I think this is most likely to be a malformed feed
            ui.err_print('Encountered feed error: {}'.format(
                feed.bozo_exception))
            ui.debug_print('bozo message: {}'.format(
                feed.bozo_exception.getMessage()))
        # In either of these error cases we probably shouldn't return error
        # so the pacman hook won't hold up an operation.
        # Here return an empty set of entries in case only one of multiple
        # feeds failed to fetch
        try:
            feed = feedparser.util.FeedParserDict()
            feed.update({'entries': []})
        except Exception as e:
            ui.err_print('Unexpected error: {}'.format(e))
            sys.exit()

    return feed
def main(argv):
    sess = CacheControl(requests.Session(), cache=FileCache('.web_cache'))
    requests.get = sess.get
    resource_schema = tools.load(sys.argv[1])
    apply_all_tweaks(resource_schema)
    if len(argv) == 3 and argv[2].endswith('json'):
        tools.write(resource_schema, argv[1])
    else:
        print(tools.print_(resource_schema))
    return 0
def __init__(
    self,
    fetcher: Optional[Fetcher] = None,
    namespaces: Optional[Dict[str, str]] = None,
    schemas: Optional[Dict[str, str]] = None,
    fileuri: Optional[str] = None,
    copyfrom: Optional["LoadingOptions"] = None,
    original_doc: Optional[Any] = None,
) -> None:
    """Create a LoadingOptions object."""
    self.idx: Dict[str, Dict[str, Any]] = {}
    self.fileuri: Optional[str] = fileuri
    self.namespaces = namespaces
    self.schemas = schemas
    self.original_doc = original_doc
    if copyfrom is not None:
        self.idx = copyfrom.idx
        if fetcher is None:
            fetcher = copyfrom.fetcher
        if fileuri is None:
            self.fileuri = copyfrom.fileuri
        if namespaces is None:
            self.namespaces = copyfrom.namespaces
        if schemas is None:
            self.schemas = copyfrom.schemas

    if fetcher is None:
        import requests
        from cachecontrol.caches import FileCache
        from cachecontrol.wrapper import CacheControl

        root = pathlib.Path(os.environ.get("HOME", tempfile.gettempdir()))
        session = CacheControl(
            requests.Session(),
            cache=FileCache(root / ".cache" / "salad"),
        )
        self.fetcher: Fetcher = DefaultFetcher({}, session)
    else:
        self.fetcher = fetcher

    self.vocab = _vocab
    self.rvocab = _rvocab

    if namespaces is not None:
        self.vocab = self.vocab.copy()
        self.rvocab = self.rvocab.copy()
        for k, v in namespaces.items():
            self.vocab[k] = v
            self.rvocab[v] = k
def requests_session() -> requests.Session:
    """Creates a CacheControl-wrapped Requests session object."""
    global _session

    if _session is not None:
        return _session

    cache_name = cache_directory('blender_cloud_http')
    log.info('Storing cache in %s' % cache_name)

    _session = cachecontrol.CacheControl(sess=requests.session(),
                                         cache=FileCache(cache_name))
    return _session
def get_cached_session():
    sess = CacheControl(requests.Session(),
                        cache=FileCache(CACHE_DIR),
                        heuristic=LastModifiedNoDate(require_date=False))

    original_get = sess.get

    def wrapped_get(*args, **kwargs):
        try:
            return original_get(*args, **kwargs)
        except (OSError, IOError):
            # Fall back to an uncached request if the on-disk cache fails.
            return requests.get(*args, **kwargs)

    sess.get = wrapped_get
    return sess
def __init__(
    self,
    fetcher=None,       # type: Optional[Fetcher]
    namespaces=None,    # type: Optional[Dict[str, str]]
    schemas=None,       # type: Optional[Dict[str, str]]
    fileuri=None,       # type: Optional[str]
    copyfrom=None,      # type: Optional[LoadingOptions]
    original_doc=None,  # type: Optional[Any]
):  # type: (...) -> None
    self.idx = {}  # type: Dict[str, Dict[str, Any]]
    self.fileuri = fileuri  # type: Optional[str]
    self.namespaces = namespaces
    self.schemas = schemas
    self.original_doc = original_doc
    if copyfrom is not None:
        self.idx = copyfrom.idx
        if fetcher is None:
            fetcher = copyfrom.fetcher
        if fileuri is None:
            self.fileuri = copyfrom.fileuri
        if namespaces is None:
            self.namespaces = copyfrom.namespaces
        if schemas is None:
            self.schemas = copyfrom.schemas

    if fetcher is None:
        import requests
        from cachecontrol.caches import FileCache
        from cachecontrol.wrapper import CacheControl

        root = pathlib.Path(os.environ.get("HOME", tempfile.gettempdir()))
        session = CacheControl(
            requests.Session(),
            cache=FileCache(root / ".cache" / "salad"),
        )
        self.fetcher: Fetcher = DefaultFetcher({}, session)
    else:
        self.fetcher = fetcher

    self.vocab = _vocab
    self.rvocab = _rvocab

    if namespaces is not None:
        self.vocab = self.vocab.copy()
        self.rvocab = self.rvocab.copy()
        for k, v in namespaces.items():
            self.vocab[k] = v
            self.rvocab[v] = k
def make_session(scraper):
    """
    Instantiate a session with the desired configuration parameters,
    including the cache policy.
    """
    cache_path = os.path.join(scraper.config.data_path, 'cache')
    cache_policy = scraper.config.cache_policy
    cache_policy = cache_policy.lower().strip()

    session = ScraperSession()
    session.scraper = scraper
    session.cache_policy = cache_policy

    adapter = CacheControlAdapter(FileCache(cache_path),
                                  cache_etags=True,
                                  controller_class=PolicyCacheController)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
def __init__(
    self,
    config: Config | None = None,
    io: IO | None = None,
    cache_id: str | None = None,
    disable_cache: bool = False,
) -> None:
    self._config = config or Config.create()
    self._io = io
    self._sessions_for_netloc: dict[str, requests.Session] = {}
    self._credentials: dict[str, HTTPAuthCredential] = {}
    self._certs: dict[str, dict[str, Path | None]] = {}
    self._configured_repositories: dict[
        str, AuthenticatorRepositoryConfig] | None = None
    self._password_manager = PasswordManager(self._config)
    self._cache_control = (
        FileCache(str(REPOSITORY_CACHE_DIR / (cache_id or "_default_cache") / "_http"))
        if not disable_cache
        else None
    )
def fromParameters(
        cls,
        sessionFactory: Callable[[], requests.Session],
        cachePath: str,
        maxAgeDictionary: Mapping[str, int]
) -> 'IntersphinxCache':
    """
    Construct an instance with the given parameters.

    @param sessionFactory: A zero-argument L{callable} that returns a
        L{requests.Session}.

    @param cachePath: Path of the cache directory.

    @param maxAgeDictionary: A mapping describing the maximum age of any
        cache entry.

    @see: L{parseMaxAge}
    """
    session = CacheControl(sessionFactory(),
                           cache=FileCache(cachePath),
                           heuristic=ExpiresAfter(**maxAgeDictionary))
    return cls(session)
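# For context, a hedged usage sketch of the classmethod above, assuming it
# carries a @classmethod decorator on IntersphinxCache. The cache path and
# max-age mapping are illustrative; ExpiresAfter forwards its keyword
# arguments to datetime.timedelta, so keys such as "hours" or "days" work.
import requests

cache = IntersphinxCache.fromParameters(
    sessionFactory=requests.Session,
    cachePath="/tmp/intersphinx-cache",  # illustrative path
    maxAgeDictionary={"hours": 12},      # keep entries for twelve hours
)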
def _get_filehandle(filepath_or, *args, **kwargs):
    """Open file if `filepath_or` looks like a string/unicode/bytes, else
    pass through.
    """
    if _is_string_or_bytes(filepath_or):
        if requests.compat.urlparse(filepath_or).scheme in {'http', 'https'}:
            sess = CacheControl(requests.Session(),
                                cache=FileCache(gettempdir()))
            req = sess.get(filepath_or, **kwargs)

            # if the response is not 200, an exception will be raised
            req.raise_for_status()

            fh, own_fh = BytesIO(req.content), True
        else:
            fh, own_fh = open(filepath_or, *args, **kwargs), True
    else:
        fh, own_fh = filepath_or, False
    return fh, own_fh
def __init__(self, engine, **kwargs):
    self.id = hashlib.sha1(
        six.b('{0}:{1!r}'.format(clsname(self),
                                 kwargs.get('__init_args__', {})))).hexdigest()
    if engine:
        self.engine = engine
    else:
        self.engine = busbus.Engine()
    self.engine._register_provider(self)

    self._requests = requests.Session()
    # This requests session object, wrapped with CacheControl, is useful
    # for long-term storage of larger files, such as GTFS data.
    self._cached_requests = CacheControl(
        self._requests,
        cache=FileCache(self.engine.config['url_cache_dir']))
def get_feed(feed_url):
    """
    Return the parsed feed
    """
    requests_timeout = getattr(settings, 'FEED_TIMOUT', 1)

    cache_adapter = CacheControlAdapter(
        cache=FileCache('.web_cache'),
        heuristic=ExpiresAfter(hours=1),
    )

    session = requests.Session()
    session.mount('http://', cache_adapter)
    session.mount('https://', cache_adapter)

    show_exceptions = getattr(settings, 'DEBUG', True)

    feed_request = session.get(feed_url, timeout=requests_timeout)
    return feedparser.parse(feed_request.text)
def __init__(self, api_key=None, locale=None, anonymize=False,
             exclude_episodes=False, user_agent=None, cache=None,
             proxy_uri=None, verify_ssl=True, session=None):
    self.api_key = api_key or SHA1_KEY
    self.timestamp = time.mktime(datetime.date.today().timetuple())
    self.user_agent = user_agent or random.choice(USER_AGENTS)
    self.locale = locale or 'en_US'
    self.exclude_episodes = exclude_episodes
    self.caching_enabled = True if cache is True else False
    self.proxy_uri = proxy_uri or DEFAULT_PROXY_URI
    self.anonymize = anonymize
    self.verify_ssl = verify_ssl
    self.session = session or requests.Session()

    if self.caching_enabled:
        warnings.warn('caching will be removed in version 5.0.0 '
                      'due to not being thread safe')
        self.session = CacheControl(
            self.session, cache=FileCache('.imdbpie_cache')
        )
def fromParameters(cls, sessionFactory, cachePath, maxAgeDictionary):
    """
    Construct an instance with the given parameters.

    @param sessionFactory: A zero-argument L{callable} that returns a
        L{requests.Session}.

    @param cachePath: Path of the cache directory.
    @type cachePath: L{str}

    @param maxAgeDictionary: A dictionary describing the maximum age of any
        cache entry.
    @type maxAgeDictionary: L{dict}

    @see: L{parseMaxAge}
    """
    session = CacheControl(sessionFactory(),
                           cache=FileCache(cachePath),
                           heuristic=ExpiresAfter(**maxAgeDictionary))
    return cls(session)
def __init__(
    self,
    config: Config | None = None,
    io: IO | None = None,
    cache_id: str | None = None,
    disable_cache: bool = False,
) -> None:
    self._config = config or Config.create()
    self._io = io
    self._sessions_for_netloc: dict[str, requests.Session] = {}
    self._credentials: dict[str, HTTPAuthCredential] = {}
    self._certs: dict[str, RepositoryCertificateConfig] = {}
    self._configured_repositories: dict[
        str, AuthenticatorRepositoryConfig] | None = None
    self._password_manager = PasswordManager(self._config)
    self._cache_control = (
        FileCache(str(self._config.repository_cache_directory
                      / (cache_id or "_default_cache") / "_http"))
        if not disable_cache
        else None
    )

    self.get_repository_config_for_url = functools.lru_cache(maxsize=None)(
        self._get_repository_config_for_url)
def get_session():
    CACHE_FOLDER.mkdir(exist_ok=True)
    cache = FileCache(str(CACHE_FOLDER), forever=True)
    cache.set("foo", b"bar")
    assert cache.get("foo") == b"bar"

    session = RateLimitingSession()
    # session.headers.update({"x-api-key": "something-something-darkside"})
    session.mount(
        "https://www.metlink.org.nz/",
        CacheControlAdapter(heuristic=BetterExpiresAfter(days=7), cache=cache),
    )
    session.mount(
        METLINK_API_URL_PREFIX,
        CacheControlAdapter(heuristic=BetterExpiresAfter(days=1), cache=cache),
    )
    session.mount(
        METLINK_API_URL_PREFIX + "ServiceLocation/",
        CacheControlAdapter(heuristic=BetterExpiresAfter(seconds=90), cache=cache),
    )
    return session
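# A hedged, self-contained variant of the per-prefix idea above using only the
# stock ExpiresAfter heuristic (RateLimitingSession and BetterExpiresAfter are
# project-specific); the host, paths, and lifetimes are illustrative. requests
# picks the adapter with the longest matching URL prefix, so narrower paths can
# be given shorter cache lifetimes than the rest of the site.
import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import ExpiresAfter

def make_prefix_cached_session(cache_dir=".web_cache"):
    cache = FileCache(cache_dir)
    session = requests.Session()
    # Mostly-static pages: keep cached responses for a week.
    session.mount("https://example.org/",
                  CacheControlAdapter(heuristic=ExpiresAfter(days=7), cache=cache))
    # API responses: expire after ninety seconds.
    session.mount("https://example.org/api/",
                  CacheControlAdapter(heuristic=ExpiresAfter(seconds=90), cache=cache))
    return session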
class Settings:
    do_update_wikidata = True

    # Also read tags if a project doesn't use GitHub's releases
    read_tags = True

    normalize_repo_url = True

    blacklist_page = "User:Github-wiki-bot/Exceptions"
    whitelist_page = "User:Github-wiki-bot/Whitelist"
    blacklist: List[str] = []
    whitelist: List[str] = []
    sparql_file = "free_software_items.rq"

    # pywikibot is too stupid to cache the calendar model, so let's do this manually
    calendarmodel = pywikibot.Site().data_repository().calendarmodel()
    wikidata_repo = pywikibot.Site("wikidata", "wikidata").data_repository()

    repo_regex = re.compile(r"^[a-z]+://github.com/[^/]+/[^/]+/?$")

    cached_session: requests.Session = CacheControl(
        requests.Session(), cache=FileCache("cache")
    )
def downloadHttpFile(httpurl):
    # type: (Text) -> Text
    cache_session = None
    if "XDG_CACHE_HOME" in os.environ:
        directory = os.environ["XDG_CACHE_HOME"]
    elif "HOME" in os.environ:
        directory = os.environ["HOME"]
    else:
        directory = os.path.expanduser('~')

    cache_session = CacheControl(
        requests.Session(),
        cache=FileCache(os.path.join(directory, ".cache", "cwltool")))

    r = cache_session.get(httpurl, stream=True)
    with NamedTemporaryFile(mode='wb', delete=False) as f:
        for chunk in r.iter_content(chunk_size=16384):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    r.close()
    return f.name
def __init__(self, redis_connection=None, fallback_cache_duration=5,
             file_cache_directory=".webcache", timeout=(0.5, 3),
             *args, **kwargs):
    super(CachedSession, self).__init__(*args, **kwargs)

    heuristic = ExpiresAfterIfNoCacheControl(seconds=fallback_cache_duration)

    cache = FileCache(file_cache_directory)
    if redis_connection:
        cache = RedisCache(redis_connection)

    adapter = CacheAdapterWithTimeout(heuristic=heuristic,
                                      cache=cache,
                                      timeout=timeout)
    self.mount("http://", adapter)
    self.mount("https://", adapter)
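# For context, a hedged usage sketch of the class above; the cache directory
# and URL are illustrative. Without a redis_connection the session falls back
# to the on-disk FileCache, and responses lacking Cache-Control headers are
# kept for the fallback duration given here.
session = CachedSession(fallback_cache_duration=60,
                        file_cache_directory="/tmp/webcache")
response = session.get("https://example.com/api/status")  # hypothetical URL
response.raise_for_status()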
def open(self):
    global SESSION
    if SESSION is None:
        SESSION = CacheControl(Session(), cache=FileCache(SESSION_CACHE_PATH))

    try:
        self._response = SESSION.get(self.uri, headers=self.headers)
    except ConnectionError as e:
        raise LoaderException('request connection error: "%s"' % self.uri, cause=e)
    except Exception as e:
        raise LoaderException('request error: "%s"' % self.uri, cause=e)

    status = self._response.status_code
    if status == 404:
        self._response = None
        raise DocumentNotFoundException('document not found: "%s"' % self.uri)
    elif status != 200:
        self._response = None
        raise LoaderException('request error %d: "%s"' % (status, self.uri))
def __init__(self, api_key=None, locale=None, anonymize=None,
             exclude_episodes=None, user_agent=None, cache=None,
             proxy_uri=None, verify_ssl=None):
    self.api_key = api_key or SHA1_KEY
    self.timestamp = time.mktime(datetime.date.today().timetuple())
    self.user_agent = user_agent or random.choice(USER_AGENTS)
    self.locale = locale or 'en_US'
    self.exclude_episodes = exclude_episodes is True
    self.caching_enabled = cache is True
    self.proxy_uri = proxy_uri or DEFAULT_PROXY_URI
    # Default anonymize to False and verify_ssl to True when not supplied.
    self.anonymize = anonymize if anonymize is not None else False
    self.verify_ssl = verify_ssl if verify_ssl is not None else True
    self.session = requests

    if self.caching_enabled:
        self.session = CacheControl(requests.Session(),
                                    cache=FileCache('.imdbpie_cache'))