def __init__(self, url="https://pypi.org/", disable_cache=False, fallback=True): super(PyPiRepository, self).__init__(url.rstrip("/") + "/simple/") self._base_url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = REPOSITORY_CACHE_DIR / "pypi" self._cache = CacheManager({ "default": "releases", "serializer": "json", "stores": { "releases": { "driver": "file", "path": str(release_cache_dir) }, "packages": { "driver": "dict" }, }, }) self._cache_control_cache = FileCache(str(release_cache_dir / "_http")) self._session = CacheControl(requests.session(), cache=self._cache_control_cache) self._name = "PyPI"
def __init__(self, url="https://pypi.org/", disable_cache=False, fallback=True): self._url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / "pypi" self._cache = CacheManager({ "default": "releases", "serializer": "json", "stores": { "releases": { "driver": "file", "path": str(release_cache_dir) }, "packages": { "driver": "dict" }, }, }) self._cache_control_cache = FileCache(str(release_cache_dir / "_http")) self._session = CacheControl(session(), cache=self._cache_control_cache) self._inspector = Inspector() super(PyPiRepository, self).__init__() self._name = "PyPI"
def setup_sessions():
    # CacheControl patches the session object, so we can't share one
    # between the two.
    feed_sess = CacheControl(requests.Session(),
                             cache=FileCache(args.cache + '/feed'))
    # article_sess = CacheControl(session,
    #                             cache=FileCache(args.cache + '/forever', forever=True))
    article_sess = CacheControl(
        requests.Session(),
        cache=FileCache(args.cache + '/article'),
        heuristic=cachecontrol.heuristics.ExpiresAfter(days=2 * 365))
    return (feed_sess, article_sess)
class Offers:
    cache_dir = XDG_CACHE_HOME / 'price-ec2' / 'http'
    session = CacheControl(requests.Session(), cache=FileCache(cache_dir))

    def prices(self, service, region=None):
        def fetch_json(url):
            with progress('fetching ' + url):
                response = self.session.get('https://pricing.us-east-1.amazonaws.com' + url)
                response.raise_for_status()
                return response.json()

        index_url = '/offers/v1.0/aws/index.json'
        index = fetch_json(index_url)
        if region:
            region_index_url = index['offers'][service]['currentRegionIndexUrl']
            region_index = fetch_json(region_index_url)
            region_url = region_index['regions'][region]['currentVersionUrl']
            return fetch_json(region_url)
        else:
            current_url = index['offers'][service]['currentVersionUrl']
            return fetch_json(current_url)

    @lru_cache()
    def ec2(self, region):
        return self.prices('AmazonEC2', region)

    @lru_cache()
    def rds(self, region):
        return self.prices('AmazonRDS', region)

    @lru_cache()
    def elasticache(self, region):
        return self.prices('AmazonElastiCache', region)
def download(workdir, url):
    """Download a file, using .cache inside workdir as an HTTP cache."""
    logging.debug(u"initializing requests and cache-control")
    session = CacheControl(requests.Session(),
                           cache=FileCache(os.path.join(workdir, '.cache')))
    session.mount('file://', LocalFileAdapter())
    req = session.get(url, stream=True)
    try:
        downloaded_file = tempfile.TemporaryFile()
        size = 0
        start = datetime.datetime.now()
        for chunk in req.iter_content(chunk_size=1024000):
            if chunk:
                sys.stdout.write('.')
                sys.stdout.flush()
                downloaded_file.write(chunk)
                size += len(chunk)
        # print newline
        print()
        downloaded_file.flush()
        # logging uses %-style placeholders, not str.format-style ones
        logging.info(u"downloaded %s - %s bytes in %s s.", url, size,
                     (datetime.datetime.now() - start).total_seconds())
        logging.debug(u"reset file pointer - seek(0)")
        downloaded_file.seek(0)
        return downloaded_file
    except Exception as exc:
        logging.debug(u"error on download, closing and deleting file")
        downloaded_file.close()
        raise exc
def __init__(self):
    super(ProjectHolder, self).__init__()
    app_name = __name__.split('.')[0]

    self.cache_dir = user_cache_dir(app_name)
    log.info("Using cache directory: {}.".format(self.cache_dir))
    self.cache = FileCache(self.cache_dir)
    cache_adapter = CacheControlAdapter(cache=self.cache)
    self.mount("http://", cache_adapter)
    self.mount("https://", cache_adapter)

    self.headers.update(
        {'User-Agent': '{}/{}'.format(app_name, __version__)})
    log.info('Created instance of {}'.format(type(self).__name__))
    self.branches = None
    self.only = None
    self.exclude = None
    self.having_asset = None
    self.hostname = None
    # identifies project on a given hostname
    self.repo = None
    # short name for "repo", useful in URLs
    self.name = None
    # in some cases we do not specify a repo explicitly; the feed is
    # discovered instead and no repo is given
    self.feed_url = None
def all_sites(sitemap_url='http://library.link/harvest/sitemap.xml'):
    '''
    >>> from librarylink.util import all_sites
    >>> [ s.host for s in all_sites() if 'denverlibrary' in s.host ]
    ['link.denverlibrary.org']
    '''
    # FIXME: Avoid accumulating all the nodes, which will require improvements to xml.treesequence
    @coroutine
    def sink(accumulator):
        while True:
            e = yield
            loc = next(select_name(e, 'loc'))
            lastmod = next(select_name(e, 'lastmod'))
            s = liblink_site()
            s.sitemap = loc.xml_value
            s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
            s.base_url = s.url  # Legacy property name
            # Early warning for funky URLs breaking stuff downstream
            assert not tail
            protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
            s.lastmod = lastmod.xml_value
            accumulator.append(s)

    nodes = []
    ts = xml.treesequence(('sitemapindex', 'sitemap'), sink(nodes))
    if hasattr(all_sites, 'cachedir'):
        sess = CacheControl(requests.Session(), cache=FileCache(all_sites.cachedir))
    else:
        sess = CacheControl(requests.Session())
    result = sess.get(sitemap_url)
    ts.parse(result.text)
    yield from nodes
def center_iterator(client=None) -> Iterator[Dict]:
    if not PLATFORM_ENABLED:
        logger.warning(
            f"{PLATFORM.capitalize()} scraping is disabled in the configuration file."
        )
        return []
    session = CacheControl(requests.Session(), cache=FileCache("./cache"))
    if client:
        session = client
    try:
        url = f'{get_config().get("base_urls").get("github_public_path")}{get_conf_outputs().get("centers_json_path").format(PLATFORM)}'
        response = session.get(url)
        # Skip the cache-source logging when called from the unit tests
        if not client:
            if response.from_cache:
                logger.info(
                    f"Liste des centres pour {PLATFORM} vient du cache")
            else:
                logger.info(
                    f"Liste des centres pour {PLATFORM} est une vraie requête")
        data = response.json()
        logger.info(
            f"Found {len(data)} {PLATFORM.capitalize()} centers (external scraper)."
        )
        for center in data:
            yield center
    except Exception as e:
        logger.warning(f"Unable to scrape {PLATFORM} centers: {e}")
def __init__(self, name, url): if name == "pypi": raise ValueError("The name [pypi] is reserved for repositories") self._packages = [] self._name = name self._url = url.rstrip("/") self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name self._cache = CacheManager({ "default": "releases", "serializer": "json", "stores": { "releases": { "driver": "file", "path": str(self._cache_dir) }, "packages": { "driver": "dict" }, "matches": { "driver": "dict" }, }, }) self._session = CacheControl(requests.session(), cache=FileCache( str(self._cache_dir / "_http")))
def __init__(
    self, name, url, auth=None, disable_cache=False
):  # type: (str, str, Optional[Auth], bool) -> None
    if name == "pypi":
        raise ValueError("The name [pypi] is reserved for repositories")

    self._packages = []
    self._name = name
    self._url = url.rstrip("/")
    self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name
    self._cache = CacheManager(
        {
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {"driver": "file", "path": str(self._cache_dir)},
                "packages": {"driver": "dict"},
                "matches": {"driver": "dict"},
            },
        }
    )

    self._session = CacheControl(
        requests.session(), cache=FileCache(str(self._cache_dir / "_http"))
    )

    url_parts = urlparse.urlparse(self._url)
    if not url_parts.username and auth:
        self._session.auth = auth

    self._disable_cache = disable_cache
def ec2_catalog():
    import requests
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    import logging

    logger = logging.getLogger('isitfit')
    logger.debug("Downloading ec2 catalog (cached to local file)")

    # based on URL = 'http://www.ec2instances.info/instances.json'
    # URL = 's3://...csv'
    # Edit 2019-09-10 use CDN link instead of direct gitlab link
    # URL = 'https://gitlab.com/autofitcloud/www.ec2instances.info-ec2op/raw/master/www.ec2instances.info/t3b_smaller_familyL2.json'
    URL = 'https://cdn.jsdelivr.net/gh/autofitcloud/www.ec2instances.info-ec2op/www.ec2instances.info/t3b_smaller_familyL2.json'

    # cached https://cachecontrol.readthedocs.io/en/latest/
    sess = requests.session()
    cached_sess = CacheControl(sess, cache=FileCache('/tmp/isitfit_ec2info.cache'))
    r = cached_sess.request('get', URL)

    # read catalog, copy from ec2op-cli/ec2op/optimizer/cwDailyMaxMaxCpu
    import json
    j = json.dumps(r.json(), indent=4, sort_keys=True)

    from pandas import read_json
    df = read_json(j, orient='split')

    # Edit 2019-09-13 no need to subsample the columns at this stage
    # df = df[['API Name', 'Linux On Demand cost']]

    df = df.rename(columns={'Linux On Demand cost': 'cost_hourly'})
    # df = df.set_index('API Name')  # need to use merge, not index
    return df
def __init__(self, url='https://pypi.org/', disable_cache=False, fallback=True):
    self._name = 'PyPI'
    self._url = url
    self._disable_cache = disable_cache
    self._fallback = fallback

    release_cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / 'pypi'
    self._cache = CacheManager(
        {
            'default': 'releases',
            'serializer': 'json',
            'stores': {
                'releases': {'driver': 'file', 'path': str(release_cache_dir)},
                'packages': {'driver': 'dict'},
            },
        }
    )

    self._session = CacheControl(
        session(), cache=FileCache(str(release_cache_dir / '_http'))
    )

    super(PyPiRepository, self).__init__()
def api_call(endpoint, method, field_name=None):
    endpoint = endpoint.lstrip('/')
    headers = {}
    cache_dir = user_cache_dir("gh")
    log.info("Using cache directory: {}.".format(cache_dir))
    api_token = os.getenv("GITHUB_API_TOKEN")
    if api_token:
        log.info("Using API token")
        headers['Authorization'] = "token {}".format(api_token)

    with CacheControl(requests.Session(), cache=FileCache(cache_dir)) as s:
        s.headers.update(headers)
        if method == 'GET':
            r = s.get('https://api.github.com/{}'.format(endpoint), headers=headers)
            rj = r.json()
            if r.status_code != 200:
                eprint(json.dumps(rj))
                exit(22)
            if field_name:
                if field_name in rj:
                    return rj[field_name]
                else:
                    exit(23)
            else:
                return json.dumps(r.json())
        s.close()
def __init__(self, name, url):
    if name == 'pypi':
        raise ValueError('The name [pypi] is reserved for repositories')

    self._packages = []
    self._name = name
    self._url = url.rstrip('/')
    self._cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / name
    self._cache = CacheManager(
        {
            'default': 'releases',
            'serializer': 'json',
            'stores': {
                'releases': {'driver': 'file', 'path': str(self._cache_dir)},
                'packages': {'driver': 'dict'},
                'matches': {'driver': 'dict'},
            },
        }
    )

    self._session = CacheControl(
        requests.session(), cache=FileCache(str(self._cache_dir / '_http'))
    )
def __init__(self, name, url, disable_cache=False):
    if name == "pypi":
        raise ValueError("The name [pypi] is reserved for repositories")

    self._packages = []
    self._name = name
    self._url = url.rstrip("/")
    self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name
    self._cache = CacheManager(
        {
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {"driver": "file", "path": str(self._cache_dir)},
                "packages": {"driver": "dict"},
                "matches": {"driver": "dict"},
            },
        }
    )

    self._session = CacheControl(
        requests.session(), cache=FileCache(str(self._cache_dir / "_http"))
    )

    url_parts = urlparse.urlparse(self._url)
    if not url_parts.username:
        self._session.auth = get_http_basic_auth(
            Config.create("auth.toml"), self.name
        )

    self._disable_cache = disable_cache
def __init__(self, filename=""): super(BabelNet, self).__init__() if filename == "": filename = "babelnet_cache" self.mount('https://', CacheControlAdapter(cache=FileCache(filename))) self.headers.update({'Accept-Encoding': 'gzip'}) self.params.update({'key': cfg.babelnet_key}) self.endpoint = "https://babelnet.io/v4/"
def get_stored_session():
    global __STORED_SESSION

    create_directory_tree(CACHE_DIR)
    if __STORED_SESSION is None:
        __STORED_SESSION = CacheControl(requests.Session(), cache=FileCache(CACHE_DIR))
    return __STORED_SESSION
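# A minimal usage sketch for get_stored_session() above; the URL is a
# placeholder, not taken from the original code. CacheControl marks replayed
# responses with a from_cache attribute, so a repeat request for the same URL
# can be checked against the on-disk cache in CACHE_DIR.
session = get_stored_session()
first = session.get("https://example.org/data.json")
second = session.get("https://example.org/data.json")
print(getattr(second, "from_cache", False))  # True when served from the file cache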
def __init__(self, config={}, cache=None):
    self.config = config
    if cache is None:
        # sticky local cache directory for testing
        cache = FileCache(".cache", forever=True)
    self.session = CacheControl(requests.Session(), cache=cache,
                                heuristic=ExpiresAfter(days=30))
def __init__(self, uri: str = None, session: requests.Session = None, seed: str = None): # Airtable and gssutils are using slightly different field names.... self.meta_field_mapping = {"published": "issued"} # Add an explicit on/off for temp scraping (based on presence of dataURL) self.temp_scrape = False # Use seed if provided if seed is not None: with open(seed, "r") as f: self.seed = json.load(f) if "dataURL" in self.seed: logging.warning( "A temporary dataURL has been specified; proceeding with a temp scrape." ) self.temp_scrape = True if "landingPage" not in self.seed.keys(): raise MetadataError( 'We always need to provide a "landingPage" via the seed. Either' " it's own or alongside a dataURL for temporary scrapes." ) uri = self.seed["landingPage"] else: self.seed = None self.uri = uri self.dataset = pmdcat.Dataset(uri) self.catalog = dcat.Catalog() self.dataset.modified = datetime.now(timezone.utc).astimezone() self.distributions = [] if session: self.session = session elif "RECORD_MODE" in os.environ: # don't use cachecontrol, but we'll need to patch the session when used. self.session = requests.Session() else: self.session = CacheControl( requests.Session(), cache=FileCache(".cache"), serializer=BiggerSerializer(), heuristic=LastModified(), ) if "JOB_NAME" in os.environ: self._base_uri = URIRef("http://gss-data.org.uk") self._dataset_id = pathify(os.environ["JOB_NAME"]) else: self._base_uri = BNode() parsed_scrape_uri = urlparse(self.uri) self._dataset_id = (parsed_scrape_uri.netloc.replace(".", "/") + parsed_scrape_uri.path) self.update_dataset_uris() self._run()
def start_http_session():
    # Start the cached HTTP Session.
    # Cache directory will be created if it doesn't exist.
    cache_path = utils.project_path('.cache')
    http_session = CacheControl(requests.Session(),
                                heuristic=CacheHeuristic(),
                                cache=FileCache(cache_path))
    http_session.headers = get_requests_header()
    return http_session
def __init__(self, api_user, api_key, aliases):
    s = requests.Session()
    s.headers["x-api-user"] = api_user
    s.headers["x-api-key"] = api_key
    self.s = CacheControl(s, cache=FileCache(str(HTTP_CACHE)))
    self.aliases = aliases
    self.cron_tz = CRON_TZ
    self.cron_time = CRON_TIME
    self.cron_file = CRON_FILE
def get_session(*args, **kwargs):
    session = OAuth2Session(*args, **kwargs)
    cache_adapter = CacheControlAdapter(
        cache=FileCache(CACHE_FILE),
        pool_connections=config.http.connections,
        pool_maxsize=config.http.connections,
        max_retries=config.http.retries,
    )
    session.mount("http://", cache_adapter)
    return session
def __init__(
    self,
    name: str,
    url: str,
    config: Optional[Config] = None,
    disable_cache: bool = False,
    cert: Optional[Path] = None,
    client_cert: Optional[Path] = None,
) -> None:
    if name == "pypi":
        raise ValueError("The name [pypi] is reserved for repositories")

    self._packages = []
    self._name = name
    self._url = url.rstrip("/")
    self._client_cert = client_cert
    self._cert = cert
    self._cache_dir = REPOSITORY_CACHE_DIR / name
    self._cache = CacheManager(
        {
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {"driver": "file", "path": str(self._cache_dir)},
                "packages": {"driver": "dict"},
                "matches": {"driver": "dict"},
            },
        }
    )

    self._authenticator = Authenticator(config=config or Config(use_environment=True))

    self._session = CacheControl(
        self._authenticator.session, cache=FileCache(str(self._cache_dir / "_http"))
    )

    username, password = self._authenticator.get_credentials_for_url(self._url)
    if username is not None and password is not None:
        self._authenticator.session.auth = requests.auth.HTTPBasicAuth(
            username, password
        )

    if self._cert:
        self._authenticator.session.verify = str(self._cert)

    if self._client_cert:
        self._authenticator.session.cert = str(self._client_cert)

    self._disable_cache = disable_cache
def session(self):
    session = self._authenticator.session

    if self._basic_auth:
        session.auth = self._basic_auth

    if self._cert:
        session.verify = str(self._cert)

    if self._client_cert:
        session.cert = str(self._client_cert)

    return CacheControl(session, cache=FileCache(str(self._cache_dir / "_http")))
def _default_urlgetter(
    cache_dir=_DEFAULT_AWS_PRICING_CACHE_DIR,
    urlgetter=None,
):
    if urlgetter is None:
        from requests import Session
        from cachecontrol import CacheControl
        from cachecontrol.caches.file_cache import FileCache

        urlgetter = CacheControl(
            Session(),
            cache=FileCache(cache_dir),
        )
    return urlgetter
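# A minimal usage sketch for _default_urlgetter() above; the AWS pricing index
# URL matches the one used by the Offers class earlier, while "/tmp/aws-pricing"
# is just an illustrative cache directory.
getter = _default_urlgetter(cache_dir="/tmp/aws-pricing")
response = getter.get("https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/index.json")
response.raise_for_status()
index = response.json()  # repeat calls can be answered from the FileCache while still fresh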
def __init__( self, url: str = "https://pypi.org/", disable_cache: bool = False, fallback: bool = True, ) -> None: super(PyPiRepository, self).__init__(url.rstrip("/") + "/simple/") self._base_url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = REPOSITORY_CACHE_DIR / "pypi" self._cache = CacheManager({ "default": "releases", "serializer": "json", "stores": { "releases": { "driver": "file", "path": str(release_cache_dir) }, "packages": { "driver": "dict" }, }, }) self._cache_control_cache = FileCache(str(release_cache_dir / "_http")) inner_session = requests.Session() retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504]) inner_session.mount(self._base_url, HTTPAdapter(max_retries=retries)) self._session = CacheControl(inner_session, cache=self._cache_control_cache) self._name = "PyPI"
def __init__(self, uri: str = None, session: requests.Session = None, seed: str = None):
    # Airtable and gssutils are using slightly different field names....
    self.meta_field_mapping = {
        "published": "issued"
    }

    # Add an explicit on/off for temp scraping (based on presence of dataURL)
    self.temp_scrape = False

    # Use seed if provided
    if seed is not None:
        with open(seed, "r") as f:
            self.seed = json.load(f)

            if "dataURL" in self.seed:
                logging.warning("A temporary dataURL has been specified; proceeding with a temp scrape.")
                uri = self.seed["dataURL"]
                self.temp_scrape = True
            elif "landingPage" not in self.seed:
                raise MetadataError("Aborting, insufficient seed data. No landing page supplied via "
                                    "info.json and no dataURL to use as a fallback.")
            else:
                uri = self.seed["landingPage"]
    else:
        self.seed = None

    self.uri = uri
    self.dataset = pmdcat.Dataset(uri)
    self.catalog = dcat.Catalog()
    self.dataset.modified = datetime.now(timezone.utc).astimezone()
    self.distributions = []

    if session:
        self.session = session
    else:
        self.session = CacheControl(requests.Session(),
                                    cache=FileCache('.cache'),
                                    serializer=BiggerSerializer(),
                                    heuristic=LastModified())

    if 'JOB_NAME' in os.environ:
        self._base_uri = URIRef('http://gss-data.org.uk')
        self._dataset_id = pathify(os.environ['JOB_NAME'])
    else:
        self._base_uri = BNode()
        parsed_scrape_uri = urlparse(self.uri)
        self._dataset_id = parsed_scrape_uri.netloc.replace('.', '/') + parsed_scrape_uri.path

    self.update_dataset_uris()

    self._run()
def __init__(self, app_name: str, expires_after: datetime.timedelta = datetime.timedelta(days=28)):
    self.app_name: str = str(app_name)
    self.cache_dir = PathPlus(platformdirs.user_cache_dir(self.app_name))
    self.cache_dir.maybe_make(parents=True)

    self.session: requests.Session = CacheControl(
        sess=requests.Session(),
        cache=FileCache(self.cache_dir),
        heuristic=ExpiresAfter(
            days=expires_after.days,
            seconds=expires_after.seconds,
            microseconds=expires_after.microseconds,
        ),
        adapter_class=RateLimitAdapter,
    )
def get_http_session():
    global _http_session

    if _http_session is None:
        _http_session = requests.session()
        if cachecontrol:
            _http_session = cachecontrol.CacheControl(
                _http_session,
                cache=FileCache(
                    user_cache_dir(__appname__, __appauthor__), forever=True
                ),
                heuristic=ExpiresAfter(days=14),
            )
    return _http_session
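# A minimal usage sketch for get_http_session() above; the URL is a
# placeholder, not from the original code. When cachecontrol is importable,
# responses are written to the per-user cache directory and the
# ExpiresAfter(days=14) heuristic lets them be reused for up to two weeks;
# otherwise a plain requests session is returned and every call hits the network.
session = get_http_session()
response = session.get("https://example.org/releases.json")
print(getattr(response, "from_cache", False))  # False on the first (network) fetch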
def __init__(self): super().__init__() self.name = "MAL" self.logo_url = 'https://upload.wikimedia.org/wikipedia/commons/7/7a/MyAnimeList_Logo.png' self.website_url = 'https://myanimelist.net/' client_id = "add1ed488bd218c2e10146345377a0b8" url_auth = "https://myanimelist.net/v1/oauth2/authorize" url_token = "https://myanimelist.net/v1/oauth2/token" self.authenticator = OAuth(self.name, client_id, url_auth, url_token) self.requests_session = CacheControl(requests.Session(), cache=FileCache('.Cache/MAL'), heuristic=MALHeuristic()) # self.requests_session = requests.Session() self.rate_limiter = AsyncRateLimiter(max_calls=100, period=1, callback=limited)