# Permanent cache (forever)
import datetime

import requests
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import ExpiresAfter

sess = requests.session()
forever_cache_storage = FileCache('.web_cache', forever=True)
heuristic = ExpiresAfter(weeks=52)
cached_sess = CacheControl(sess, cache=forever_cache_storage, heuristic=heuristic)

for n in range(10):
    start_time = datetime.datetime.now()
    response = cached_sess.get("https://ya.ru")
    delta_time = datetime.datetime.now() - start_time
    print("Time delta: ", delta_time)
    assert response.status_code == 200
import os
import sys
import logging
import requests
import json

from cachecontrol import CacheControl
from cachecontrol.caches import FileCache

cache_dir = '/tmp'

logging.basicConfig(stream=sys.stdout,
                    format="%(asctime)s: " + logging.BASIC_FORMAT,
                    datefmt="%Y-%m-%dT%H:%M:%S%z")
logger = logging.getLogger(__name__)

req = CacheControl(requests.Session(),
                   cache=FileCache(os.path.join(cache_dir, 'pyutu.cache')))

regions = {
    'ap-northeast-1': "Asia Pacific (Tokyo)",
    'ap-southeast-1': "Asia Pacific (Singapore)",
    'ap-southeast-2': "Asia Pacific (Sydney)",
    'eu-central-1': "EU (Frankfurt)",
    'eu-west-1': "EU (Ireland)",
    'sa-east-1': "South America (Sao Paulo)",
    'us-east-1': "US East (N. Virginia)",
    'us-west-1': "US West (N. California)",
    'us-west-2': "US West (Oregon)"
}

svcs = {
    "ec2": {
# pointers to remote datasets
organisation_csv = os.environ.get(
    "organisation_csv",
    "https://raw.githubusercontent.com/digital-land/organisation-dataset/master/collection/organisation.csv",
)
region_csv = "https://raw.githubusercontent.com/digital-land/region-collection/master/data/region.csv"


def name_to_identifier(n):
    return n.lower().replace(" ", "-").replace(",", "")


# cache files collected
session = CacheControl(requests.session(), cache=FileCache(".cache"))


def get(url):
    r = session.get(url)
    r.raise_for_status()
    return r.text


def get_csv_as_json(path_to_csv, cache=False):
    if cache:
        csv_str = get(path_to_csv)
        csv_pd = pd.read_csv(StringIO(csv_str), sep=",")
    else:
        csv_pd = pd.read_csv(path_to_csv, sep=",")
    return json.loads(csv_pd.to_json(orient="records"))
#!/usr/bin/env python3
import os
import requests
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
import tempfile

PROJECTS = [356]
USERS = ['jptolosa87']
TASK_MANAGER = "https://tasks.kaart.com"
CACHE_DIRECTORY = os.path.join(tempfile.gettempdir(), 'task_modified')

cached_session = CacheControl(requests.session(), cache=FileCache(CACHE_DIRECTORY))


def getModifiedTasksInProject(project):
    api = "api/v1/project/{pid}".format(pid=project)
    with cached_session.get("/".join([TASK_MANAGER, api])) as response:
        response.raise_for_status()
        json_data = response.json()
        tasks = json_data['tasks']['features']
    return findModifiedTasks(project, tasks)


def findModifiedTasks(project, tasks):
    api = "api/v1/project/{project}/task/{task}"
    modified_tasks = []
    for task in tasks:
from requests.sessions import Session
from cachecontrol import CacheControl

import google.auth.transport.requests
from google.oauth2 import id_token

VALID_ISSUERS = ['accounts.google.com', 'https://accounts.google.com']
CACHED_SESSION = CacheControl(Session())

# XXX Don't hardcode
USERINFO_ENDPOINT = "https://openidconnect.googleapis.com/v1/userinfo"


def get_idinfo_from_access_token(access_token: str) -> dict:
    """Fetches user information using the access token provided.

    Raises ValueError if an error occurs, including the user not being authorized.
    """
    session = Session()
    r = session.get(USERINFO_ENDPOINT,
                    headers={'Authorization': 'Bearer ' + access_token})
    if r.status_code != 200:
        raise ValueError("Unexpected response code %d" % r.status_code)
    return r.json()


def validate_id_token(idt: str, client_id: str) -> dict:
    """Validate the id_token passed using Google's validation code.

    idt is an id_token, which can be extracted from an OpenID Connect
    authorization response as the 'id_token' field.
def sess():
    sess = CacheControl(requests.Session())
    yield sess

    # closing session object
    sess.close()
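# The generator above follows the pytest yield-fixture pattern: everything before the
# yield is setup, everything after it is teardown. A minimal sketch of how it could be
# wired up, assuming pytest is in use (the decorator, fixture name, and test below are
# illustrative additions, not part of the original snippet):
import pytest
import requests
from cachecontrol import CacheControl


@pytest.fixture()
def cached_sess():
    sess = CacheControl(requests.Session())
    yield sess
    # closing session object
    sess.close()


def test_uses_cached_session(cached_sess):
    response = cached_sess.get("https://docs.python.org/3/")
    assert response.status_code == 200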
    class OneWeekHeuristic(BaseHeuristic):
        '''
        Class to force storing pages in CacheControl.
        From CacheControl examples.
        '''

        def update_headers(self, response):
            date = parsedate(response.headers['date'])
            expires = datetime(*date[:6]) + timedelta(weeks=1)
            return {
                'expires': formatdate(calendar.timegm(expires.timetuple())),
                'cache-control': 'public',
            }

    Scrapper._requests = CacheControl(Scrapper._requests, cache=FileCache(
        tempfile.gettempdir() + '/cagematch-cache',
        forever=True,
    ), heuristic=OneWeekHeuristic())
except Exception as e:
    logging.warning('CacheControl not available: %s', e)


class WikiData(Scrapper):
    API_URL = 'https://www.wikidata.org/w/api.php'
    COMMONS_URL = 'https://commons.wikimedia.org/wiki/File:'

    WRESTLER_ID = 13474373
    PROMOTION_ID = 131359
def getURL(url, post_data=None, params=None, headers=None, timeout=30, session=None, json=False):
    """
    Returns a byte-string retrieved from the url provider.
    """

    # request session
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session,
                           cache=caches.FileCache(os.path.join(cache_dir, 'sessions')))

    # request session headers
    req_headers = {'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'}
    if headers:
        req_headers.update(headers)
    session.headers.update(req_headers)

    # request session ssl verify
    session.verify = False

    # request session parameters
    session.params = params

    try:
        # Remove double-slashes from url
        parsed = list(urlparse.urlparse(url))
        parsed[2] = re.sub("/{2,}", "/", parsed[2])  # replace two or more / with one
        url = urlparse.urlunparse(parsed)

        # request session proxies
        if sickbeard.PROXY_SETTING:
            logger.log("Using proxy for url: " + url, logger.DEBUG)
            session.proxies = {
                "http": sickbeard.PROXY_SETTING,
                "https": sickbeard.PROXY_SETTING,
            }

        # decide if we get or post data to server
        if post_data:
            resp = session.post(url, data=post_data, timeout=timeout)
        else:
            resp = session.get(url, timeout=timeout)

        if not resp.ok:
            logger.log(u"Requested url " + url + " returned status code is " + str(resp.status_code) +
                       ': ' + clients.http_error_code[resp.status_code], logger.DEBUG)
            return

    except requests.exceptions.HTTPError as e:
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return
def test_file_cache_recognizes_consumed_file_handle(self):
    s = CacheControl(Session(), FileCache('web_cache'))
    s.get('http://httpbin.org/cache/60')
    r = s.get('http://httpbin.org/cache/60')
    assert r.from_cache
def latest(repo, output_format='version', pre=False, newer_than=False,
           assets_filter=False, shortUrls=False):

    # data that we may collect further
    # the main thing, we're after - parsed version number, e.g. 1.2.3 (no extras chars)
    version = None
    # corresponding tag name, e.g. v1.2.3 or v1.2.3-stable (extra chars OK,
    # used for constructing non-API tar download URLs)
    tag = None
    description = None
    # set this when an API returns json
    data = None
    license = None
    # date of selected release, used in checks
    # github API returns tags NOT in chronological order
    # so if author switched from v20150121 (old) to v2.0.1 format, the old value is "higher"
    # so we have to check if a tag is actually newer, this is very slow but we have to accept :)
    tagDate = None

    headers = {}
    cache_dir = user_cache_dir("lastversion")
    log.info("Using cache directory: {}.".format(cache_dir))

    # Some special non-Github cases for our repository are handled by checking URL
    # 1. nginx version is taken as version of stable (written by rpm check script)
    # to /usr/local/share/builder/nginx-stable.ver
    if repo.startswith(('http://nginx.org/', 'https://nginx.org/')):
        with open('/usr/local/share/builder/nginx-stable.ver', 'r') as file:
            return file.read().replace('\n', '')
    # 2. monit version can be obtained from Bitbucket downloads section of the project
    elif repo.startswith('https://mmonit.com/'):
        with CacheControl(requests.Session(), cache=FileCache(cache_dir)) as s:
            # Special case Monit repo
            response = s.get(
                "https://api.bitbucket.org/2.0/repositories/{}/downloads".format("tildeslash/monit"),
                headers=headers)
            data = response.json()
            s.close()
            return sanitize_version(data['values'][0]['name'])
    # 3. Everything else is GitHub passed as owner/repo
    else:
        # But if full link specified, strip it to owner/repo
        apiBase = 'https://api.github.com'
        githubHostname = 'github.com'
        if repo.startswith(('https://', 'http://')):
            urlParts = repo.split('/')
            githubHostname = urlParts[2]
            repo = urlParts[3] + "/" + urlParts[4]
            if 'github.com' != githubHostname:
                apiBase = "https://{}/api/v3".format(githubHostname)

        # Explicitly specify API version we want:
        # headers['Accept'] = "application/vnd.github.v3+json"

        api_token = os.getenv("GITHUB_API_TOKEN")
        if api_token:
            headers['Authorization'] = "token {}".format(api_token)

        with CacheControl(requests.Session(), cache=FileCache(cache_dir)) as s:
            s.headers.update(headers)

            # search it :)
            if '/' not in repo:
                r = s.get('{}/search/repositories?q={}+in:name'.format(apiBase, repo),
                          headers=headers)
                repo = r.json()['items'][0]['full_name']

            # releases/latest fetches only non-prerelease, non-draft, so it
            # should not be used for hunting down pre-releases assets
            if not pre:
                # https://stackoverflow.com/questions/28060116/which-is-more-reliable-for-github-api-conditional-requests-etag-or-last-modifie/57309763?noredirect=1#comment101114702_57309763
                # ideally we disable ETag validation for this endpoint completely
                r = s.get('{}/repos/{}/releases/latest'.format(apiBase, repo),
                          headers=headers)
                if r.status_code == 200:
                    the_tag = r.json()['tag_name']
                    version = sanitize_version(the_tag, pre)
                    if version:
                        log.info("Set version as current selection: {}.".format(version))
                        tag = the_tag
                        data = r.json()
                        tagDate = dateutil.parser.parse(r.json()['published_at'])
            else:
                r = s.get('{}/repos/{}/releases'.format(apiBase, repo), headers=headers)
                if r.status_code == 200:
                    for release in r.json():
                        the_tag = release['tag_name']
                        the_version = sanitize_version(the_tag, pre)
                        if the_version and ((not version) or (the_version > version)):
                            version = the_version
                            log.info("Set version as current selection: {}.".format(version))
                            tag = the_tag
                            data = release
                            tagDate = dateutil.parser.parse(data['published_at'])

            # formal release may not exist at all, or be "late/old" in case
            # actual release is only a simple tag so let's try /tags
            r = s.get('{}/repos/{}/tags'.format(apiBase, repo), headers=headers)
            if r.status_code == 200:
                for t in r.json():
                    the_tag = t['name']
                    the_version = sanitize_version(the_tag, pre)

                    r_commit = s.get('{}/repos/{}/git/commits/{}'.format(
                        apiBase, repo, t['commit']['sha']), headers=headers)
                    the_date = r_commit.json()['committer']['date']
                    the_date = dateutil.parser.parse(the_date)

                    if (the_version and ((not version) or (the_version > version))) \
                            or (not tagDate or the_date > tagDate):
                        # rare case: if upstream filed formal pre-release that passes as stable
                        # version (tag is 1.2.3 instead of 1.2.3b) double check if pre-release
                        # TODO handle API failure here as it may result in "false positive"?
                        if not pre:
                            r = s.get('{}/repos/{}/releases/tags/{}'.format(apiBase, repo, the_tag),
                                      headers=headers)
                            if r.status_code == 200:
                                if r.json()['prerelease']:
                                    log.info("Found formal release for this tag which is unwanted "
                                             "pre-release: {}.".format(version))
                                    continue
                        version = the_version
                        log.info("Setting version as current selection: {}.".format(version))
                        tag = the_tag
                        tagDate = the_date
                        data = t
            else:
                sys.stderr.write(r.text)
                return None

            if output_format == 'json':
                r = s.get('{}/repos/{}/license'.format(apiBase, repo), headers=headers)
                if r.status_code == 200:
                    license = r.json()
            s.close()

    # bail out, found nothing that looks like a release
    if not version:
        return False

    # special exit code "2" is useful for scripting to detect if no newer release exists
    if newer_than and not (version > newer_than):
        sys.exit(2)

    # return the release if we've reached far enough:
    if output_format == 'version':
        return str(version)
    elif output_format == 'json':
        if not data:
            data = {}
        if description:
            description = description.strip()
        data['version'] = str(version)
        data['description'] = description
        data['v_prefix'] = tag.startswith("v")
        data['spec_tag'] = tag.replace(str(version), "%{upstream_version}")
        data['tag_name'] = tag
        data['license'] = license
        return json.dumps(data)
    elif output_format == 'assets':
        urls = []
        if 'assets' in data and len(data['assets']) > 0:
            for asset in data['assets']:
                if assets_filter:
                    if not re.search(assets_filter, asset['name']):
                        continue
                else:
                    if os.name == 'nt' and asset['name'].endswith(posixAssetMarkers + darwinAssetMarkers):
                        continue
                    # zips are OK for Linux, so we do some heuristics to weed out Windows stuff
                    if os.name == 'posix' and asset['name'].endswith(darwinAssetMarkers + windowsAssetMarkers):
                        continue
                urls.append(asset['browser_download_url'])
        else:
            download_url = github_tag_download_url(githubHostname, repo, tag, shortUrls)
            if not assets_filter or re.search(assets_filter, download_url):
                urls.append(download_url)
        if not len(urls):
            sys.exit(3)
        else:
            return "\n".join(urls)
    elif output_format == 'source':
        return github_tag_download_url(githubHostname, repo, tag, shortUrls)
def latest(repo, output_format='version', pre_ok=False, assets_filter=None,
           short_urls=False, major=None, only=None, at=None, having_asset=None):
    """Find latest release version for a project.

    Args:
        major (str): Only consider versions which are "descendants" of this major version string
        short_urls (bool): Whether we should try to return shorter URLs for release data
        assets_filter (str): Regular expression for filtering assets for the latest release
        only (str): Only consider tags with this text. Useful for repos with multiple projects
        repo (str): Repository specifier in any form.
        output_format (str): Affects return format. Possible values `version`, `json`, `dict`,
            `assets`, `source`, `tag`.
        pre_ok (bool): Specifies whether pre-releases can be accepted as newer version.
        at (str): Specifies repo hosting more precisely, only useful if repo argument was
            specified as one word.
        having_asset (Union[str, bool]): Only consider releases with the given asset.
            Pass `True` for any asset

    Returns:
        Version: Newer version object, if found and `output_format` is `version`.

    Returns:
        str: Single string containing tag, if found and `output_format` is `tag`

    """
    cache_dir = user_cache_dir("lastversion")
    log.info("Using cache directory: {}.".format(cache_dir))

    repo_data = {}

    if repo.endswith('.yml') and not repo.startswith(('http://', 'https://')):
        with open(repo) as fpi:
            repo_data = yaml.safe_load(fpi)
            if 'repo' in repo_data:
                if 'nginx-extras' in repo:
                    repo_data['module_of'] = 'nginx'
                name = os.path.splitext(os.path.basename(repo))[0]
                if 'module_of' in repo_data:
                    name = '{}-module-{}'.format(repo_data['module_of'], name)
                repo = repo_data['repo']
                repo_data['name'] = name

    if repo.startswith(('http://', 'https://')) and repo.endswith('Chart.yaml'):
        at = 'helm_chart'

    if repo.endswith('.spec'):
        # repo is specified inside the .spec file
        # github repo is resolved via %{upstream_github} + %{name}/%{upstream_name}
        # no upstream_github global means that the spec was not prepared for lastversion
        # optional: use of spec_tag macros if the source is from GitHub. in edge cases we check
        # new version via GitHub, but prepared sources are elsewhere
        with open(repo) as f:
            name = None
            upstream_github = None
            upstream_name = None
            current_version = None
            spec_repo = None
            spec_url = None
            for l in f.readlines():
                if l.startswith('%global lastversion_repo'):
                    spec_repo = l.split(' ')[2].strip()
                elif l.startswith('%global upstream_github'):
                    upstream_github = l.split(' ')[2].strip()
                elif l.startswith('%global upstream_name'):
                    upstream_name = l.split(' ')[2].strip()
                elif l.startswith('Name:'):
                    name = l.split('Name:')[1].strip()
                elif l.startswith('URL:'):
                    spec_url = l.split('URL:')[1].strip()
                elif l.startswith('%global upstream_version '):
                    current_version = l.split(' ')[2].strip()
                    # influences %spec_tag to use %upstream_version instead of %version
                    repo_data['module_of'] = True
                elif l.startswith('Version:') and not current_version:
                    current_version = l.split('Version:')[1].strip()
            if spec_url:
                spec_host = urlparse(spec_url).hostname
                if spec_host in ['github.com'] and not upstream_github and not spec_repo:
                    log.warning(
                        'Neither %upstream_github nor %lastversion_repo macros were found. '
                        'Please prepare your spec file using instructions: '
                        'https://lastversion.getpagespeed.com/spec-preparing.html'
                    )
            if not current_version:
                log.critical('Did not find either Version: or %upstream_version in the spec file')
                sys.exit(1)
            try:
                if current_version != 'x':
                    repo_data['current_version'] = Version(current_version)
            except InvalidVersion:
                log.critical('Failed to parse current version in {}. Tried {}'.format(
                    repo, current_version))
                sys.exit(1)
            if upstream_name:
                repo_data['name'] = upstream_name
                repo_data['spec_name'] = '%{upstream_name}'
            else:
                repo_data['name'] = name
                repo_data['spec_name'] = '%{name}'
            if upstream_github:
                repo = "{}/{}".format(upstream_github, repo_data['name'])
                log.info('Discovered GitHub repo {} from .spec file'.format(repo))
            elif spec_repo:
                repo = spec_repo
                log.info('Discovered explicit repo {} from .spec file'.format(repo))
            elif spec_url:
                repo = spec_url

    if (not at or '/' in repo) and at != 'helm_chart':
        # find the right hosting for this repo
        project_holder = HolderFactory.get_instance_for_repo(repo, only=only)
    else:
        project_holder = HolderFactory.HOLDERS[at](repo, hostname=None)

    project_holder.set_only(only)
    project_holder.set_having_asset(having_asset)

    # we are completely "offline" for 1 hour, not even making conditional requests
    # heuristic=ExpiresAfter(hours=1) <- make configurable
    with CacheControl(project_holder, cache=FileCache(cache_dir)) as s:
        release = s.get_latest(pre_ok=pre_ok, major=major)
        s.close()

    # bail out, found nothing that looks like a release
    if not release:
        return None

    from_type = 'Located the latest release tag {} at: {}'.format(
        release['tag_name'], project_holder.get_canonical_link())
    if 'type' in release:
        from_type = '{} via {} mechanism'.format(from_type, release['type'])
    log.info(from_type)

    version = release['version']
    tag = release['tag_name']

    # return the release if we've reached far enough:
    if output_format == 'version':
        return version

    if output_format in ['json', 'dict']:
        if output_format == 'dict':
            release['version'] = version
        else:
            release['version'] = str(version)
        if 'tag_date' in release:
            release['tag_date'] = str(release['tag_date'])
        release['v_prefix'] = tag.startswith("v")
        version_macro = 'upstream_version' if 'module_of' in repo_data else 'version'
        version_macro = '%{{{}}}'.format(version_macro)
        holder_i = {value: key for key, value in HolderFactory.HOLDERS.items()}
        release['source'] = holder_i[type(project_holder)]
        release['spec_tag'] = tag.replace(str(version), version_macro)
        # spec_tag_no_prefix is the helpful macro which will allow us to know where tarball
        # extracts to (GitHub-specific)
        if release['spec_tag'].startswith('v{}'.format(version_macro)) or \
                re.match(r'^v\d', release['spec_tag']):
            release['spec_tag_no_prefix'] = release['spec_tag'].lstrip('v')
        else:
            release['spec_tag_no_prefix'] = release['spec_tag']
        release['tag_name'] = tag
        if hasattr(s, 'repo_license'):
            release['license'] = s.repo_license(tag)
        if hasattr(s, 'repo_readme'):
            release['readme'] = s.repo_readme(tag)
        release.update(repo_data)
        try:
            release['assets'] = s.get_assets(release, short_urls, assets_filter)
        except NotImplementedError:
            pass
        release['from'] = project_holder.get_canonical_link()
        return release

    if output_format == 'assets':
        return s.get_assets(release, short_urls, assets_filter)

    if output_format == 'source':
        return s.release_download_url(release, short_urls)

    if output_format == 'tag':
        return tag

    return None
# # WG Notifications of deaths of residents related to COVID-19 in adult care homes

from gssutils import *
import json
import numpy as np

if is_interactive():
    from requests import Session
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import ExpiresAfter
    scrape = Scraper(seed="info.json",
                     session=CacheControl(Session(),
                                          cache=FileCache('.cache'),
                                          heuristic=ExpiresAfter(days=1)))

dist = scrape.distribution(
    latest=True,
    title=lambda x: x.startswith('Notifications of deaths of residents related to COVID-19'))
tabs = {tab.name: tab for tab in dist.as_databaker()}
list(tabs)

# +
def left(s, amount):
    return s[:amount]


def right(s, amount):
    return s[-amount:]
            'size': '1',
            'zone': 'one'
        }
        create_fields['csrf_token'] = csrf_token
        self.session.post(self.url + 'volumes/create', data=create_fields)


if __name__ == "__main__":
    requests.packages.urllib3.disable_warnings()
    url = URL
    num_users = NUM_USERS
    num_iterations = NUM_ITERATIONS
    if len(sys.argv) > 1:
        url = sys.argv[1]
        if len(sys.argv) > 2:
            num_users = int(sys.argv[2])
        if len(sys.argv) > 3:
            num_iterations = int(sys.argv[3])
    else:
        print "usage: reqgenerator.py <console-url> [num sessions] [num iterations/session]"
        sys.exit()

    # start a bunch of users
    for i in range(0, num_users):
        s = requests.Session()
        s = CacheControl(s)
        print "Starting user: " + str(i)
        u = BrowsingUser(url, s, 'user' + str(i), num_iterations)
        u.login('ui-test-acct-00', 'admin', 'mypassword0')
        Thread(target=u).start()
        time.sleep(2)
import configparser
import json
from typing import Mapping

import requests
from cachecontrol import CacheControl  # type: ignore

REQUEST_CACHE = CacheControl(requests.session())
BASE_URL = "https://api.weather.gov"


class Location:
    short_name: str
    long_name: str
    forecast_url: str
    hourly_forecast_url: str
    alert_url: str

    def __init__(self, short_name: str, long_name: str, latitude: float, longitude: float):
        self.short_name = short_name
        self.long_name = long_name
        trunc = lambda f: format(f, '.4f')
        url = f"{BASE_URL}/points/{trunc(latitude)},{trunc(longitude)}"
        data = fetch_json(url, 'point')['properties']
        self.forecast_url = data['forecast']
        self.hourly_forecast_url = data['forecastHourly']
def getNewName(uid):
    api_url = "https://www.openstreetmap.org/api/0.6/user/{}".format(uid)
    session = CacheControl(requests.session())
    result = session.get(api_url).text
    root = ET.fromstring(result)
    return root.find("user").attrib['display_name']
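# Note that getNewName() above wraps a brand-new session in CacheControl on every call, so
# the default in-memory cache is discarded each time. A hypothetical variant that reuses a
# single module-level cached session, so repeated lookups can be served from cache when the
# API's response headers allow it (the names _session and get_display_name are illustrative,
# not part of the original snippet):
import xml.etree.ElementTree as ET

import requests
from cachecontrol import CacheControl

_session = CacheControl(requests.session())


def get_display_name(uid):
    api_url = "https://www.openstreetmap.org/api/0.6/user/{}".format(uid)
    result = _session.get(api_url).text
    root = ET.fromstring(result)
    return root.find("user").attrib['display_name']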
def cli(url, repositories, search, rows, minstar, token, output_file_name, max_repos_retrieved):
    MODE = os.environ.get("GHTOPDEP_ENV")
    REPOS_PER_FILE_SIZE_LIMIT = 3000

    if (search) and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(CACHE_DIR), heuristic=OneDayHeuristic())
    elif (search) and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    # spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    # spinner.start()

    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries, cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    page_url = get_page_url(sess, url, destination)

    found_repos = 0
    total_found_repos = 0
    number_of_files_processed = 0

    while True:
        time.sleep(1)
        response = sess.get(page_url)
        print(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # can be listed same package
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    # print("adding repo ", repo_url)
                    found_repos += 1
                    total_found_repos += 1
                    repos.append({"url": repo_url, "stars": repo_stars_num})
                    if found_repos >= REPOS_PER_FILE_SIZE_LIMIT:
                        sorted_repos = repos
                        repos = []
                        number_of_files_processed += 1
                        found_repos = 0
                        show_result(sorted_repos, total_repos_count, more_than_zero_count,
                                    destinations, number_of_files_processed, output_file_name)
                        print("JSON output placed into file!")
                    if total_found_repos > max_repos_retrieved:
                        print(f'Collected {total_found_repos} repos.')
                        sys.exit()

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            # spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    sorted_repos = repos
    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url, repo["stars"]))
    elif number_of_files_processed == 0:
        show_result(sorted_repos, total_repos_count, more_than_zero_count, destinations,
                    number_of_files_processed, output_file_name)
def __init__(self):
    cachefile = f"{config['cachedir']}/requests"
    self._cached_sess = CacheControl(requests.Session(), cache=FileCache(cachefile))
def download_wheel(url: str, expected_md5: str) -> bytes:
    session = requests.session()
    cached_session = CacheControl(session, cache=FileCache(".web_cache"))
    response = cached_session.get(url)
    return response.content
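# The snippet above accepts expected_md5 but never checks it in the code shown (the original
# project may verify it elsewhere). A minimal, hypothetical sketch of how the checksum could
# be verified after download; verify_wheel is an illustrative helper, not part of the
# original code:
import hashlib


def verify_wheel(payload: bytes, expected_md5: str) -> bytes:
    actual_md5 = hashlib.md5(payload).hexdigest()
    if actual_md5 != expected_md5:
        raise ValueError(f"MD5 mismatch: expected {expected_md5}, got {actual_md5}")
    return payload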
def cli(url, repositories, search, table, rows, minstar, report, description, token):
    MODE = os.environ.get("GHTOPDEP_ENV")
    BASE_URL = 'https://437w61gcj1.execute-api.us-west-2.amazonaws.com/api'
    if MODE == "development":
        BASE_URL = 'http://127.0.0.1:8080'

    if report:
        try:
            result = requests.get('{}/repos?url={}'.format(BASE_URL, url))
            if result.status_code != 404:
                sorted_repos = sort_repos(result.json()['deps'], rows)
                repos = readable_stars(sorted_repos)
                click.echo(tabulate(repos, headers="keys", tablefmt="github"))
                sys.exit()
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    if (description or search) and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(CACHE_DIR), heuristic=OneDayHeuristic())
    elif (description or search) and not token:
        click.echo("Please provide token")
        sys.exit()

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"
    page_url = "{0}/network/dependents?dependent_type={1}".format(url, destination.upper())

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    spinner.start()

    sess = requests.session()
    retries = Retry(total=15, backoff_factor=15, status_forcelist=[429])
    adapter = CacheControlAdapter(max_retries=retries, cache=FileCache(CACHE_DIR),
                                  heuristic=OneDayHeuristic())
    sess.mount("http://", adapter)
    sess.mount("https://", adapter)

    while True:
        response = sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = repo_stars_list[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # can be listed same package
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(gh, relative_repo_url)
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num,
                            "description": repo_description
                        })
                    else:
                        repos.append({
                            "url": repo_url,
                            "stars": repo_stars_num
                        })

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    if report:
        try:
            requests.post('{}/repos'.format(BASE_URL), json={"url": url, "deps": repos})
        except requests.exceptions.ConnectionError as e:
            click.echo(e)

    sorted_repos = sort_repos(repos, rows)

    if search:
        for repo in repos:
            repo_path = urlparse(repo["url"]).path[1:]
            for s in gh.search_code("{0} repo:{1}".format(search, repo_path)):
                click.echo("{0} with {1} stars".format(s.html_url, repo["stars"]))
    else:
        show_result(sorted_repos, total_repos_count, more_than_zero_count, destinations, table)
import requests
from cachecontrol import CacheControl

session = requests.session()
cached_session = CacheControl(session)  # Wrap the session in a caching cached_session.

# The first request has no cache yet, so the content is fetched from the server and cached.
response = cached_session.get('https://docs.python.org/3/')
print(response.from_cache)  # False

# The second request uses the ETag and Last-Modified values to check whether the page has changed.
# If it has not changed, the content is served from the cache, which is much faster.
response = cached_session.get('https://docs.python.org/3/')
print(response.from_cache)  # True
import sys

import requests
from cachecontrol import CacheControl
from datetime import datetime

from itunes import HOST_NAME

__all__ = [
    'TS_FORMAT', 'SESSION', 'ITunesException', 'BaseObject', 'Resource',
    'NoResultsFoundException', 'Artist', 'Album', 'Track', 'Audiobook',
    'Software', 'TVEpisode'
]

#: iTunes API Timestamp format
TS_FORMAT = '%Y-%m-%dT%H:%M:%S'

#: Globally accessible cache-enabled requests session
SESSION = CacheControl(requests.session())


class ITunesException(Exception):
    """Base iTunes request exception"""

    def __init__(self, message):
        self.message = message

    def __str__(self):
        return '{type}: {msg}'.format(type=self.__class__.__name__, msg=self.message)


class NoResultsFoundException(ITunesException):
    """iTunes error for when no results are returned from a Lookup"""

    def __init__(self):
def sess(self, url, tmpdir):
    self.url = url
    self.cache = FileCache(str(tmpdir))
    sess = CacheControl(requests.Session(), cache=self.cache)
    return sess
def includeme(config):   # pragma: no cover
    dataseturigenerator = UriPatternGenerator('https://id.erfgoed.net/datasets/thesauri/%s')

    TREES = SQLAlchemyProvider(
        {'id': 'TREES', 'conceptscheme_id': 1},
        config.registry.dbmaker
    )

    GEO = SQLAlchemyProvider(
        {'id': 'GEOGRAPHY', 'conceptscheme_id': 2},
        config.registry.dbmaker
    )

    STYLES = SQLAlchemyProvider(
        {
            'id': 'STYLES',
            'conceptscheme_id': 3,
            'dataset': {
                'uri': dataseturigenerator.generate(id='stijlen_en_culturen'),
                'publisher': ['https://id.erfgoed.net/actoren/501'],
                'created': [date(2008,2,14)],
                'language': ['nl-BE'],
                'license': [
                    'https://creativecommons.org/licenses/by/4.0/',
                    'http://data.vlaanderen.be/doc/licentie/modellicentie-gratis-hergebruik/v1.0'
                ]
            }
        },
        config.registry.dbmaker,
        uri_generator=UriPatternGenerator('https://id.erfgoed.net/thesauri/stijlen_en_culturen/%s')
    )

    MATERIALS = SQLAlchemyProvider(
        {
            'id': 'MATERIALS',
            'conceptscheme_id': 4,
            'dataset': {
                'uri': dataseturigenerator.generate(id='materialen'),
                'publisher': ['https://id.erfgoed.net/actoren/501'],
                'created': [date(2011,3,16)],
                'language': ['nl-BE'],
                'license': [
                    'https://creativecommons.org/licenses/by/4.0/',
                    'http://data.vlaanderen.be/doc/licentie/modellicentie-gratis-hergebruik/v1.0'
                ]
            }
        },
        config.registry.dbmaker,
        uri_generator=UriPatternGenerator('https://id.erfgoed.net/thesauri/materialen/%s')
    )

    EVENTTYPES = SQLAlchemyProvider(
        {
            'id': 'EVENTTYPE',
            'conceptscheme_id': 5,
            'dataset': {
                'uri': dataseturigenerator.generate(id='gebeurtenistypes'),
                'publisher': ['https://id.erfgoed.net/actoren/501'],
                'created': [date(2010,8,13)],
                'language': ['nl-BE'],
                'license': [
                    'https://creativecommons.org/licenses/by/4.0/',
                    'http://data.vlaanderen.be/doc/licentie/modellicentie-gratis-hergebruik/v1.0'
                ]
            }
        },
        config.registry.dbmaker,
        uri_generator=UriPatternGenerator('https://id.erfgoed.net/thesauri/gebeurtenistypes/%s')
    )

    HERITAGETYPES = SQLAlchemyProvider(
        {
            'id': 'HERITAGETYPE',
            'conceptscheme_id': 6,
            'dataset': {
                'uri': dataseturigenerator.generate(id='erfgoedtypes'),
                'publisher': ['https://id.erfgoed.net/actoren/501'],
                'created': [date(2008,2,14)],
                'language': ['nl-BE'],
                'license': [
                    'https://creativecommons.org/licenses/by/4.0/',
                    'http://data.vlaanderen.be/doc/licentie/modellicentie-gratis-hergebruik/v1.0'
                ]
            }
        },
        config.registry.dbmaker,
        uri_generator=UriPatternGenerator('https://id.erfgoed.net/thesauri/erfgoedtypes/%s')
    )

    PERIODS = SQLAlchemyProvider(
        {
            'id': 'PERIOD',
            'conceptscheme_id': 7,
            'dataset': {
                'uri': dataseturigenerator.generate(id='dateringen'),
                'publisher': ['https://id.erfgoed.net/actoren/501'],
                'created': [date(2008,2,14)],
                'language': ['nl-BE'],
                'license': [
                    'https://creativecommons.org/licenses/by/4.0/',
                    'http://data.vlaanderen.be/doc/licentie/modellicentie-gratis-hergebruik/v1.0'
                ]
            }
        },
        config.registry.dbmaker,
        uri_generator=UriPatternGenerator('https://id.erfgoed.net/thesauri/dateringen/%s')
    )

    SPECIES = SQLAlchemyProvider(
        {
            'id': 'SPECIES',
            'conceptscheme_id': 8,
            'dataset': {
                'uri': dataseturigenerator.generate(id='soorten'),
                'publisher': ['https://id.erfgoed.net/actoren/501'],
                'created': [date(2011,5,23)],
                'language': ['nl-BE', 'la'],
                'license': [
                    'https://creativecommons.org/licenses/by/4.0/',
                    'http://data.vlaanderen.be/doc/licentie/modellicentie-gratis-hergebruik/v1.0'
                ]
            }
        },
        config.registry.dbmaker,
        uri_generator=UriPatternGenerator('https://id.erfgoed.net/thesauri/soorten/%s')
    )

    # use 'subject': ['external'] for read only external providers (only available in REST service)
    getty_session = CacheControl(requests.Session(), heuristic=ExpiresAfter(weeks=1))

    AAT = AATProvider(
        {'id': 'AAT', 'subject': ['external']},
        session=getty_session
    )

    TGN = TGNProvider(
        {'id': 'TGN', 'subject': ['external']},
        session=getty_session
    )

    eh_session = CacheControl(requests.Session(), heuristic=ExpiresAfter(weeks=1))

    EH_PERIOD = HeritagedataProvider(
        {'id': 'EH_PERIOD', 'subject': ['external']},
        scheme_uri='http://purl.org/heritagedata/schemes/eh_period',
        session=eh_session
    )

    EH_MONUMENT_TYPE = HeritagedataProvider(
        {'id': 'EH_MONUMENT_TYPE', 'subject': ['external']},
        scheme_uri='http://purl.org/heritagedata/schemes/eh_tmt2',
        session=eh_session
    )

    EH_MATERIALS = HeritagedataProvider(
        {'id': 'EH_MATERIALS', 'subject': ['external']},
        scheme_uri='http://purl.org/heritagedata/schemes/eh_tbm',
        session=eh_session
    )

    skosregis = config.get_skos_registry()

    skosregis.register_provider(TREES)
    skosregis.register_provider(GEO)
    skosregis.register_provider(STYLES)
    skosregis.register_provider(MATERIALS)
    skosregis.register_provider(EVENTTYPES)
    skosregis.register_provider(HERITAGETYPES)
    skosregis.register_provider(PERIODS)
    skosregis.register_provider(SPECIES)
    skosregis.register_provider(AAT)
    skosregis.register_provider(TGN)
    skosregis.register_provider(EH_PERIOD)
    skosregis.register_provider(EH_MONUMENT_TYPE)
    skosregis.register_provider(EH_MATERIALS)
def setUp(self):
    self.url = "https://httpbin.org/cache/60"
    self.sess = CacheControl(requests.Session(), cache=SQLiteCache(":memory:"))
from datetime import datetime as dt
import io
import logging

logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("Debug level logging turned on")

import requests
from requests.exceptions import HTTPError
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import ExpiresAfter

sess = requests.session()
cached_sess = CacheControl(sess, cache=FileCache('.web_cache'), heuristic=ExpiresAfter(hours=1))

try:
    response = cached_sess.get(
        'https://rgtdb.com/events/json?search=&offset=0&limit=200'
    )  # Get 200 events. Should be about a week's worth of events
    response.raise_for_status()
except HTTPError as http_err:
    print(f'HTTP error occurred: {http_err}')
except Exception as err:
    print(f'Other error occurred: {err}')

logger.setLevel(logging.ERROR)
from cachecontrol import CacheControl
from jose import jwt
from jose.exceptions import JWTError

_GOOGLE_OAUTH2_CERTS_URL = "https://www.googleapis.com/oauth2/v1/certs"

OAUTH2_PROVIDER = {
    'issuer': 'accounts.google.com',
    'audience': os.getenv('OAUTH2_CLIENT_ID'),
    'options': {
        'verify_at_hash': False
    }
}

sess = requests.session()
cached_sess = CacheControl(sess)


def current_user():
    return g.get('auth_user_payload')


def gsuite_authenticate(f):
    @wraps(f)
    def wrapper(*args, **kwargs):
        token = request.headers.get('Authorization', '')
        token = token.replace('Bearer ', '').replace('bearer ', '')
        if not token:
            return jsonify({
                'message': 'Unauthorized. No Authorization token provided.',
                'code': 401
def cli(url, repositories, rows, minstar, description, token):
    if description and token:
        gh = github3.login(token=token)
        CacheControl(gh.session, cache=FileCache(".ghtopdep_cache"), heuristic=OneDayHeuristic())
        Repo = namedtuple("Repo", ["url", "stars", "description"])
    elif description and not token:
        click.echo("Please provide token")
    else:
        Repo = namedtuple("Repo", ["url", "stars"])

    destination = "repository"
    destinations = "repositories"
    if not repositories:
        destination = "package"
        destinations = "packages"
    page_url = "{0}/network/dependents?dependent_type={1}".format(url, destination.upper())

    repos = []
    more_than_zero_count = 0
    total_repos_count = 0
    spinner = Halo(text="Fetching information about {0}".format(destinations), spinner="dots")
    spinner.start()

    sess = requests.session()
    cached_sess = CacheControl(sess, cache=FileCache(".ghtopdep_cache"), heuristic=OneDayHeuristic())

    while True:
        response = cached_sess.get(page_url)
        parsed_node = HTMLParser(response.text)
        dependents = parsed_node.css(ITEM_SELECTOR)
        total_repos_count += len(dependents)
        for dep in dependents:
            repo_stars_list = dep.css(STARS_SELECTOR)
            # only for ghost or private? packages
            if repo_stars_list:
                repo_stars = dep.css(STARS_SELECTOR)[0].text().strip()
                repo_stars_num = int(repo_stars.replace(",", ""))
            else:
                continue

            if repo_stars_num != 0:
                more_than_zero_count += 1
            if repo_stars_num >= minstar:
                relative_repo_url = dep.css(REPO_SELECTOR)[0].attributes["href"]
                repo_url = "{0}{1}".format(GITHUB_URL, relative_repo_url)

                # can be listed same package
                is_already_added = already_added(repo_url, repos)
                if not is_already_added and repo_url != url:
                    if description:
                        repo_description = fetch_description(gh, relative_repo_url)
                        repos.append(Repo(repo_url, repo_stars_num, repo_description))
                    else:
                        repos.append(Repo(repo_url, repo_stars_num))

        node = parsed_node.css(NEXT_BUTTON_SELECTOR)
        if len(node) == 2:
            page_url = node[1].attributes["href"]
        elif len(node) == 0 or node[0].text() == "Previous":
            spinner.stop()
            break
        elif node[0].text() == "Next":
            page_url = node[0].attributes["href"]

    sorted_repos = sort_repos(repos, rows)
    show_result(sorted_repos, total_repos_count, more_than_zero_count, destination, destinations)
# utf-8 encoding needed since it's used for the bot
# -*- coding: utf-8 -*-
import sys
import re

import requests
from cachecontrol import CacheControl
from bs4 import BeautifulSoup

session = requests.session()
cached_session = CacheControl(session)

# URL for the chatbot
URL = "https://kakko.pandorabots.com/pandora/talk?botid=f6a012073e345a08&skin=chat"

# Regex pattern to get the appropriate data
PATTERN = re.compile("</b>((.|\n)*?)<br>")


def ask_mitsuku(message):
    # Payload with message to POST
    payload = {'message': message}

    # Make POST request
    r = cached_session.post(URL, data=payload)

    # Parse data for Mitsuku's response
    soup = BeautifulSoup(r.content, 'html.parser')
    content = str(soup.p)
    pat = re.findall(PATTERN, content)
#!/usr/bin/python3
import os, requests
from cachecontrol import CacheControl
import datetime
import hashlib, json
import zipfile
from fabricutil import *
from cachecontrol.caches import FileCache

forever_cache = FileCache('http_cache', forever=True)
sess = CacheControl(requests.Session(), forever_cache)


def mkdirs(path):
    if not os.path.exists(path):
        os.makedirs(path)


def filehash(filename, hashtype, blocksize=65536):
    hash = hashtype()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()


def get_maven_url(mavenKey, server, ext):
    mavenParts = mavenKey.split(":", 3)
    mavenVerUrl = server + mavenParts[0].replace(".", "/") + "/" + mavenParts[1] + "/" + mavenParts[2] + "/"
def get_frag_by_loc_from_osm(imtiles_file, loci, zoom_level=0, padding=0, tile_size=256, no_cache=False):
    width = 360
    height = 180

    ims = []

    prefixes = ['a', 'b', 'c']
    prefix_idx = math.floor(random() * len(prefixes))
    osm_src = 'http://{}.tile.openstreetmap.org'.format(prefixes[prefix_idx])

    s = CacheControl(requests.Session())

    for locus in loci:
        id = locus[-1]

        if not no_cache:
            osm_snip = None
            try:
                osm_snip = np.load(BytesIO(rdb.get('osm_snip_%s' % id)))
                if osm_snip is not None:
                    ims.append(osm_snip)
                    continue
            except:
                pass

        start_lng = locus[0]
        end_lng = locus[1]
        start_lat = locus[2]
        end_lat = locus[3]

        if not is_within(start_lng + 180, end_lng + 180, end_lat + 90, start_lat + 90, width, height):
            ims.append(None)
            continue

        # Get tile ids
        start1, start2 = get_tile_pos_from_lng_lat(start_lng, start_lat, zoom_level)
        end1, end2 = get_tile_pos_from_lng_lat(end_lng, end_lat, zoom_level)

        xPad = padding * (end1 - start1)
        yPad = padding * (start2 - end2)

        start1 -= xPad
        end1 += xPad
        start2 += yPad
        end2 -= yPad

        tile_start1_id = math.floor(start1)
        tile_start2_id = math.floor(start2)
        tile_end1_id = math.floor(end1)
        tile_end2_id = math.floor(end2)

        start1 = math.floor(start1 * tile_size)
        start2 = math.floor(start2 * tile_size)
        end1 = math.ceil(end1 * tile_size)
        end2 = math.ceil(end2 * tile_size)

        tiles_x_range = range(tile_start1_id, tile_end1_id + 1)
        tiles_y_range = range(tile_start2_id, tile_end2_id + 1)

        # Make sure that no more than 6 standard tiles (256px) are loaded.
        if tile_size * len(tiles_x_range) > hss.SNIPPET_OSM_MAX_DATA_DIM:
            raise SnippetTooLarge()
        if tile_size * len(tiles_y_range) > hss.SNIPPET_OSM_MAX_DATA_DIM:
            raise SnippetTooLarge()

        # Extract image tiles
        tiles = []
        for y in tiles_y_range:
            for x in tiles_x_range:
                src = '{}/{}/{}/{}.png'.format(osm_src, zoom_level, x, y)

                r = s.get(src)

                if r.status_code == 200:
                    tiles.append(Image.open(BytesIO(r.content)).convert('RGB'))
                else:
                    tiles.append(None)

        osm_snip = get_frag_from_image_tiles(tiles, tile_size, tiles_x_range, tiles_y_range,
                                             tile_start1_id, tile_start2_id,
                                             start1, end1, start2, end2)

        if not no_cache:
            with BytesIO() as b:
                np.save(b, osm_snip)
                rdb.set('osm_snip_%s' % id, b.getvalue(), 60 * 30)

        ims.append(osm_snip)

    return ims