def all_sites(sitemap_url='http://library.link/harvest/sitemap.xml'):
    '''
    >>> from librarylink.util import all_sites
    >>> [ s.host for s in all_sites() if 'denverlibrary' in s.host ]
    ['link.denverlibrary.org']
    '''
    #FIXME: Avoid accumulating all the nodes, which will require improvements to xml.treesequence
    @coroutine
    def sink(accumulator):
        while True:
            e = yield
            loc = next(select_name(e, 'loc'))
            lastmod = next(select_name(e, 'lastmod'))
            s = liblink_site()
            s.sitemap = loc.xml_value
            s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
            s.base_url = s.url  #Legacy property name
            #Early warning for funky URLs breaking stuff downstream
            assert not tail
            protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
            s.lastmod = lastmod.xml_value
            accumulator.append(s)

    nodes = []
    ts = xml.treesequence(('sitemapindex', 'sitemap'), sink(nodes))
    if hasattr(all_sites, 'cachedir'):
        sess = CacheControl(requests.Session(), cache=FileCache(all_sites.cachedir))
    else:
        sess = CacheControl(requests.Session())
    result = sess.get(sitemap_url)
    ts.parse(result.text)
    yield from nodes
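# Usage sketch (not from the original source): the snippet above only enables
# on-disk caching when an optional `cachedir` attribute has been set on the
# function, so a caller can opt in like this. The path is purely illustrative.
all_sites.cachedir = '/tmp/librarylink-http-cache'
hosts = [s.host for s in all_sites()]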
def sess(self, url, tmpdir):
    self.url = url
    self.cache = FileCache(str(tmpdir))
    sess = CacheControl(requests.Session(), cache=self.cache)
    yield sess

    # closing session object
    sess.close()
def get_reader(self):
    sess = CacheControl(requests.Session(), cache=FileCache(gettempdir()))
    req = sess.get(self.file)
    # if the response is not 200, an exception will be raised
    req.raise_for_status()
    return io.BufferedReader(io.BytesIO(req.content))
def sess(self, url):
    self.etag_url = urljoin(url, "/etag")
    self.update_etag_url = urljoin(url, "/update_etag")
    self.cache = DictCache()
    sess = CacheControl(
        requests.Session(), cache=self.cache, serializer=NullSerializer()
    )
    yield sess

    # closing session object
    sess.close()
def get_cached_session():
    sess = CacheControl(requests.Session(),
                        cache=FileCache(CACHE_DIR),
                        heuristic=LastModifiedNoDate(require_date=False))
    original_get = sess.get

    def wrapped_get(*args, **kwargs):
        try:
            return original_get(*args, **kwargs)
        except (OSError, IOError) as e:
            return requests.get(*args, **kwargs)

    sess.get = wrapped_get
    return sess
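# Usage sketch (not from the original source): repeated GETs go through the
# CacheControl session backed by FileCache(CACHE_DIR); an OSError/IOError from
# the cache layer falls back to a plain, uncached requests.get. URL illustrative.
sess = get_cached_session()
first = sess.get("https://example.org/feed.xml")   # populates the cache
second = sess.get("https://example.org/feed.xml")  # may be served from CACHE_DIR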
def fetch_file(url, encoding=None):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    try:
        response = s.get(url, headers={})
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection")

    _assert(response.ok, "%s\nFile request failed: %s (%d)" % (url, response.reason, response.status_code))
    if encoding:
        response.encoding = encoding
    return response.text
def send_answer(self, answer, key):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    url = "%s/%s/answers" % (self.url, key)
    data = data2json(answer).encode('utf-8')
    tokens = JsonFile(os.path.expanduser('~/.tst/tokens.json'))
    headers = {"Authorization": "Bearer %s" % tokens.get(self.name)}
    try:
        response = s.post(url, headers=headers, data=data, allow_redirects=True)
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection (1)")

    return response
def test_not_modified_releases_connection(self, server):
    sess = CacheControl(requests.Session())
    etag_url = urljoin(server.application_url, "/etag")
    sess.get(etag_url)

    resp = Mock(status=304, headers={})

    # This is how the urllib3 response is created in
    # requests.adapters
    response_mod = "requests.adapters.HTTPResponse.from_httplib"

    with patch(response_mod, Mock(return_value=resp)):
        sess.get(etag_url)
        assert resp.read.called
        assert resp.release_conn.called
class TestStream(object):

    def setup(self):
        self.sess = CacheControl(requests.Session())

    def test_stream_is_cached(self, url):
        resp_1 = self.sess.get(url + 'stream')
        content_1 = resp_1.content

        resp_2 = self.sess.get(url + 'stream')
        content_2 = resp_2.content

        assert not resp_1.from_cache
        assert resp_2.from_cache
        assert content_1 == content_2
class JSONLocator(Locator):
    def __init__(self, url=PYPI_JSON_URL):
        self.url = url
        self.session = CacheControl(requests.session())

    def versions(self, distribution):
        url = "{}/{}/json".format(self.url, distribution)
        response = self.session.get(url)
        j = response.json()['releases']
        return [v for v, d in j.items() if len(d) > 0]

    def get(self, distribution, version):
        url = "{}/{}/json".format(self.url, distribution)
        response = self.session.get(url)
        # Reformat the data...
        return response.json()['releases'][version]
def getURL(url, post_data=None, params=None, headers=None, timeout=30, session=None, json=False):
    """
    Returns a byte-string retrieved from the url provider.
    """

    # request session
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(cache_dir, "sessions")))

    # request session headers
    req_headers = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip,deflate"}
    if headers:
        req_headers.update(headers)
    session.headers.update(req_headers)

    # request session ssl verify
    session.verify = False

    # request session parameters
    session.params = params

    try:
        # Remove double-slashes from url
        parsed = list(urlparse.urlparse(url))
        parsed[2] = re.sub("/{2,}", "/", parsed[2])  # replace two or more / with one
        url = urlparse.urlunparse(parsed)

        # request session proxies
        if sickbeard.PROXY_SETTING:
            logger.log("Using proxy for url: " + url, logger.DEBUG)
            session.proxies = {"http": sickbeard.PROXY_SETTING, "https": sickbeard.PROXY_SETTING}

        resp = session.get(url, data=post_data, timeout=timeout)
        if not resp.ok:
            logger.log(
                u"Requested url " + url + " returned status code is " + str(resp.status_code)
                + ": " + clients.http_error_code[resp.status_code],
                logger.DEBUG,
            )
            return
    except requests.exceptions.HTTPError as e:
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return
class LDClient(object):

    def __init__(self, apiKey, config=Config.default()):
        self._apiKey = apiKey
        self._config = config
        self._session = CacheControl(requests.Session())

    def get_flag(self, key, user, default=False):
        try:
            return self._get_flag(key, user, default)
        except:
            logging.exception('Unhandled exception in get_flag. Returning default value for flag.')
            return default

    def _get_flag(self, key, user, default):
        hdrs = {'Authorization': 'api_key ' + self._apiKey,
                'User-Agent': 'PythonClient/' + __version__}
        uri = self._config._base_uri + '/api/eval/features/' + key
        r = self._session.get(uri, headers=hdrs, timeout=(self._config._connect, self._config._read))
        dict = r.json()
        val = _evaluate(dict, user)
        if val is None:
            return default
        else:
            return val
def __init__(self, name, url, disable_cache=False):
    if name == "pypi":
        raise ValueError("The name [pypi] is reserved for repositories")

    self._packages = []
    self._name = name
    self._url = url.rstrip("/")
    self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name
    self._cache = CacheManager(
        {
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {"driver": "file", "path": str(self._cache_dir)},
                "packages": {"driver": "dict"},
                "matches": {"driver": "dict"},
            },
        }
    )

    self._session = CacheControl(
        requests.session(), cache=FileCache(str(self._cache_dir / "_http"))
    )

    url_parts = urlparse.urlparse(self._url)
    if not url_parts.username:
        self._session.auth = get_http_basic_auth(self.name)

    self._disable_cache = disable_cache
def __init__(self):
    self.s = requests.Session()
    # We cache ALL responses for 60 min. so eg. inline lyrics request don't make two calls right after each other.
    # This MAY have unforeseen consequences, but hopefully we can deal with those.
    self.s = CacheControl(self.s, cache_etags=False, heuristic=ExpiresAfter(minutes=60))
    self.s.headers.update({'Accept': 'application/json', 'User-Agent': VOCADB_USER_AGENT})
    self.opts = {'nameMatchMode': 'Auto', 'getTotalCount': 'true'}
    self._resources = {}
def setup(self):
    class DummyHeuristic(BaseHeuristic):
        def update_headers(self, resp):
            return {"x-dummy-header": "foobar"}

    self.sess = CacheControl(Session(), heuristic=DummyHeuristic())
def download_file(url, filename, session=None):
    # create session
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(cache_dir, "sessions")))

    # request session headers
    session.headers.update({"User-Agent": USER_AGENT, "Accept-Encoding": "gzip,deflate"})

    # request session ssl verify
    session.verify = False

    # request session streaming
    session.stream = True

    # request session proxies
    if sickbeard.PROXY_SETTING:
        logger.log("Using proxy for url: " + url, logger.DEBUG)
        session.proxies = {"http": sickbeard.PROXY_SETTING, "https": sickbeard.PROXY_SETTING}

    try:
        resp = session.get(url)
        if not resp.ok:
            logger.log(
                u"Requested url " + url + " returned status code is " + str(resp.status_code)
                + ": " + clients.http_error_code[resp.status_code],
                logger.DEBUG,
            )
            return False

        with open(filename, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    fp.write(chunk)
                    fp.flush()

        chmodAsParent(filename)
    except requests.exceptions.HTTPError as e:
        _remove_file_failed(filename)
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return False
def _get_filehandle(filepath_or, *args, **kwargs):
    """Open file if `filepath_or` looks like a string/unicode/bytes, else
    pass through.
    """
    if _is_string_or_bytes(filepath_or):
        if requests.compat.urlparse(filepath_or).scheme in {'http', 'https'}:
            sess = CacheControl(requests.Session(), cache=FileCache(gettempdir()))
            req = sess.get(filepath_or, **kwargs)
            # if the response is not 200, an exception will be raised
            req.raise_for_status()
            fh, own_fh = BytesIO(req.content), True
        else:
            fh, own_fh = open(filepath_or, *args, **kwargs), True
    else:
        fh, own_fh = filepath_or, False
    return fh, own_fh
def setup(self):
    class NoopHeuristic(BaseHeuristic):
        warning = Mock()

        def update_headers(self, resp):
            return {}

    self.heuristic = NoopHeuristic()
    self.sess = CacheControl(Session(), heuristic=self.heuristic)
def __init__(self):
    try:
        from cachecontrol import CacheControl
        from cachecontrol.caches import FileCache
        import tempfile
        self._requests = CacheControl(self._requests,
                                      cache=FileCache(tempfile.gettempdir() + '/cagematch-cache', forever=True))
    except:
        logging.warning('CacheControl not available')
    self._requests.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
def Request( url, method="GET", headers=DEFAULT_HEADERS, additional_headers=None, data=None, session=None, allow_redirects=True, timeout=10, load_cookies=True, mobile=False ): if additional_headers: headers.update(additional_headers) try: session = CacheControl(session) except Exception as e: pass # Error("Init web cache failed!!!", e) if mobile: headers["User-Agents"] = MOBILE_IOS_AGENTS xbmc.log("Requests headers: {0}".format(json.dumps(headers)), 1) if session: session.headers.update(headers) domain = re.search("https*\://(.+?)($|/)", url).group(1) if load_cookies: LoadCookies(session, cookies_name=domain) if data: response = session.post(url, data=data, allow_redirects=allow_redirects, timeout=timeout, verify=False) else: if method == "HEAD": response = session.head(url, allow_redirects=allow_redirects, timeout=timeout, verify=False) else: response = session.get(url, allow_redirects=allow_redirects, timeout=timeout, verify=False) response.encoding = "utf8" SaveCookies(session, cookies_name=domain) return response else: if method == "HEAD": return requests.head(url, headers=headers, allow_redirects=allow_redirects, timeout=timeout, verify=False) else: return requests.get(url, headers=headers, allow_redirects=allow_redirects, timeout=timeout, verify=False)
class reQuiver(object):
    def __init__(self):
        self._raw_endpoint = "http://quiver.archerdx.com/results?query="
        self._sesh = CacheControl(requests.Session())

    def query(self, query):
        if len(query) == 0:
            raise EmptyQueryStringException()

        q_string = self._raw_endpoint + str(query)
        response = self._sesh.get(q_string)
        if response.status_code != 200:
            raise NetworkErrorException(response.status_code)

        soup = BeautifulSoup(response.content, "html.parser")

        # parse the panels
        panels = soup.find(panel_table_filter)
        panels_list = []
        if panels is not None:
            for row in panels.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) == 2:
                    link = cells[0].a['href']
                    genes = [clean_string(gene) for gene in cells[1].string.split()]
                    panels_list.append(QuiverFushionPlexPanel(link, genes))

        # parse the fusions
        fusions = soup.find_all(fusion_table_filter)
        fusions_list = []
        if fusions is not None:
            for fusion in fusions:
                table = fusion.find('table')
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) != 2:
                        # get the link
                        link = cells[0].a['href']
                        original_annotation = clean_string(cells[1].string)
                        disease = cells[2].string.strip()
                        pubmed_link = cells[3].a['href']
                        evidence_count = int(cells[4].string)
                        fusions_list.append(QuiverGeneFushion(link, original_annotation, disease,
                                                              pubmed_link, evidence_count))

        return QuiverResultSet(panels_list, fusions_list, query)
def main():
    current = pkg_resources.get_distribution('tst').version
    if not sys.stdout.isatty():
        print(current)
        return

    cprint(WHITE, current, file=sys.stdout)
    try:
        s = requests.session()
        s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
        response = s.get('https://pypi.org/pypi/tst/json')
        data = response.json()
    except requests.ConnectionError:
        return

    latest_version = data['info']['version']
    if current != latest_version:
        cprint(YELLOW, 'Latest version available: %s' % latest_version, file=sys.stdout)
        cprint(RESET, '---\nUse `pip install --upgrade tst`')
        cprint(RESET, ' or `pip install --upgrade --user tst`')
class TestHeuristicWith3xxResponse(object):

    def setup(self):
        class DummyHeuristic(BaseHeuristic):
            def update_headers(self, resp):
                return {"x-dummy-header": "foobar"}

        self.sess = CacheControl(Session(), heuristic=DummyHeuristic())

    def test_heuristic_applies_to_301(self, url):
        the_url = url + "permanent_redirect"
        resp = self.sess.get(the_url)
        assert "x-dummy-header" in resp.headers

    def test_heuristic_applies_to_304(self, url):
        the_url = url + "conditional_get"
        resp = self.sess.get(the_url)
        assert "x-dummy-header" in resp.headers
def __init__(self, api_key, config=None):
    check_uwsgi()
    self._api_key = api_key
    self._config = config or Config.default()
    self._session = CacheControl(requests.Session())
    self._queue = queue.Queue(self._config._capacity)
    self._consumer = None
    self._offline = False
    self._lock = Lock()

    self._stream_processor = None
    if self._config._stream:
        self._stream_processor = config._stream_processor_class(api_key, config)
        self._stream_processor.start()
def __init__(self, queue, api_key, config):
    self._queue = queue
    """ @type: queue.Queue """
    self._session = CacheControl(txrequests.Session())
    """ :type: txrequests.Session """
    self._api_key = api_key
    self._config = config
    """ :type: ldclient.twisted.TwistedConfig """
    self._looping_call = None
    """ :type: LoopingCall """
def downloadHttpFile(httpurl):
    # type: (Text) -> Text
    cache_session = None
    if "XDG_CACHE_HOME" in os.environ:
        directory = os.environ["XDG_CACHE_HOME"]
    elif "HOME" in os.environ:
        directory = os.environ["HOME"]
    else:
        directory = os.path.expanduser('~')
    cache_session = CacheControl(
        requests.Session(),
        cache=FileCache(
            os.path.join(directory, ".cache", "cwltool")))

    r = cache_session.get(httpurl, stream=True)
    with NamedTemporaryFile(mode='wb', delete=False) as f:
        for chunk in r.iter_content(chunk_size=16384):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    r.close()
    return f.name
def getURL(url, post_data=None, params=None, headers=None, timeout=30, session=None, json=False):
    """
    Returns a byte-string retrieved from the url provider.
    """

    # request session
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(sickbeard.CACHE_DIR, 'sessions')))

    # request session headers
    req_headers = {'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'}
    if headers:
        req_headers.update(headers)
    session.headers.update(req_headers)

    # request session ssl verify
    session.verify = False

    # request session parameters
    session.params = params

    try:
        # Remove double-slashes from url
        parsed = list(urlparse.urlparse(url))
        parsed[2] = re.sub("/{2,}", "/", parsed[2])  # replace two or more / with one
        url = urlparse.urlunparse(parsed)

        # request session proxies
        if sickbeard.PROXY_SETTING:
            logger.log("Using proxy for url: " + url, logger.DEBUG)
            session.proxies = {
                "http": sickbeard.PROXY_SETTING,
                "https": sickbeard.PROXY_SETTING,
            }

        resp = session.get(url, data=post_data, timeout=timeout)
    except requests.exceptions.HTTPError as e:
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return
def download_file(url, filename, session=None):
    # create session
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(sickbeard.CACHE_DIR, 'sessions')))

    # request session headers
    session.headers.update({'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'})

    # request session ssl verify
    session.verify = False

    # request session streaming
    session.stream = True

    # request session proxies
    if sickbeard.PROXY_SETTING:
        logger.log("Using proxy for url: " + url, logger.DEBUG)
        session.proxies = {
            "http": sickbeard.PROXY_SETTING,
            "https": sickbeard.PROXY_SETTING,
        }

    try:
        resp = session.get(url)
        if not resp.ok:
            return False

        with open(filename, 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    fp.write(chunk)
                    fp.flush()

        chmodAsParent(filename)
    except requests.exceptions.HTTPError as e:
        _remove_file_failed(filename)
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return False
def urls(self):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    headers = {}
    tokens = JsonFile(os.path.expanduser('~/.tst/tokens.json'))
    token = tokens.get(self.name)
    try:
        response = s.get(self.url, allow_redirects=True)
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection")

    if not response.ok:
        return None

    response.encoding = 'utf-8'
    try:
        resource = response.json()
        resource['_response'] = response
    except ValueError:
        return None

    return resource
def get(self, key):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    url = "%s/%s" % (self.url, key)
    headers = {}
    tokens = JsonFile(os.path.expanduser('~/.tst/tokens.json'))
    token = tokens.get(self.name)
    if token:
        headers['Authorization'] = 'Bearer %s' % token

    try:
        response = s.get(url, headers=headers, allow_redirects=True)
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection")

    if not response.ok:
        self.last_error = response.status_code
        self.last_response = response
        return None

    response.encoding = 'utf-8'
    try:
        resource = response.json()
        resource['_response'] = response
        validate_tst_object(resource)
    except ValueError:
        #_assert(False, "Resource is not valid json")
        return None
    except AssertionError as e:
        print(resource)
        _assert(False, "Not a TST Object: %s" % e.message)

    return resource
def setup(self):
    self.sess = Session()
    self.cached_sess = CacheControl(self.sess, heuristic=LastModified())
def main(group_id, location, time_boundary, event_status, pandoc, force): key_path = os.path.normpath(os.path.expanduser('~/.meetup.com-key')) if os.path.exists(key_path): with io.open(key_path, encoding='utf8') as fh: key = fh.read().strip() else: key = None cache = FileCache('.web_cache', forever=True) requests = CacheControl(Session(), cache, cache_etags=False, heuristic=ExpiresAfter(days=1)) while True: resp = requests.get('https://api.meetup.com/status', params=dict(key=key)) if resp.status_code == 200 and resp.json().get('status') == 'ok': break elif resp.status_code == 200 and any( 'auth_fail' == e.code for e in resp.json().get('errors', [])): click.echo( 'Your meetup.com key is required. You can get it from https://secure.meetup.com/meetup_api/key/\n' ) if click.confirm( 'Open https://secure.meetup.com/meetup_api/key/ in your web browser?' ): click.launch('https://secure.meetup.com/meetup_api/key/') click.echo('') key = click.prompt('Key', hide_input=True) else: raise click.ClickException( 'Failed to get meetup.com status. Response was {!r} {!r}'. format(resp.status_code, resp.text)) click.secho( 'For convenience your key is saved in `{}`.\n'.format(key_path), fg='magenta') with open(key_path, 'w') as fh: fh.write(key) while not location: location = location or get_input( u'Location: ', completer=WordCompleter( [u'cluj', u'iasi', u'timisoara', u'bucuresti'], ignore_case=True)) while True: group_id = group_id or get_input( u'Group ID: ', completer=WordCompleter([ u'RoPython-Bucuresti', u'RoPython-Cluj', u'RoPython_Iasi', u'RoPython-Timisoara' ], ignore_case=True)) resp = requests.get('https://api.meetup.com/2/events', params=dict( key=key, group_urlname=group_id, time=time_boundary, status=event_status, )) if resp.status_code == 200: json = resp.json() if json['results']: break else: click.secho( 'Invalid group `{}`. It has no events!'.format(group_id), fg='red') group_id = None if resp.status_code == '400': click.fail( 'Failed to get make correct request. Response was {!r}'.format( resp.text)) else: click.secho('Invalid group `{}`. Response was [{}] {!r}'.format( group_id, resp.status_code, resp.text), fg='red') # click.echo(pformat(dict(resp.headers))) for event in json['results']: dt = datetime.fromtimestamp(event['time'] / 1000) event['duration'] = format_duration( event.get('duration', 3600000) / 1000) event['time'] = dt.strftime('%Y-%m-%d %H:%M') if 'how_to_find_us' in event: address = event['how_to_find_us'], else: address = () if 'venue' in event: address_1 = event['venue'].get('address_1') if address_1: address += address_1, event['venue']['address_1'] = ', '.join(address) else: event['venue'] = {'address_1': address} click.echo("{time}: {name}".format(**event)) click.echo("\t{}".format(pformat(event))) existing_path = glob( os.path.join('content', '*', dt.strftime('%Y-%m-%d*'), 'index.rst')) if existing_path and not force: if len(existing_path) > 1: click.secho('\tERROR: multiple paths matched: {}'.format( existing_path)) else: click.secho('\t`{}` already exists. 
Not importing.'.format( *existing_path), fg='yellow') else: target_dir = os.path.join( 'content', location, '{}-{}'.format(dt.strftime('%Y-%m-%d'), slugify(event['name']))) target_path = os.path.join(target_dir, 'index.rst') if not os.path.exists(target_dir): os.makedirs(target_dir) if pandoc: with tempfile.NamedTemporaryFile(delete=False) as fh: fh.write(event['description'].encode('utf-8')) rst = subprocess.check_output( ['pandoc', '--from=html', '--to=rst', fh.name]).decode('utf-8') os.unlink(fh.name) else: rst = html2rest(event['description']) doc = u'''{name} ############################################################### :tags: prezentari :registration: meetup.com: {event_url} :start: {time} :duration: {duration} :location: {venue[address_1]}, {venue[city]}, {venue[localized_country_name]} {rst}'''.format(rst=rst, **event) with io.open(target_path, 'w', encoding='utf-8') as fh: fh.write(doc) click.secho('\tWrote `{}`.'.format(target_path), fg='green')
def __init__(self, api_key, config):
    self._api_key = api_key
    self._session = CacheControl(requests.Session())
    self._config = config
import itertools
import os
import shutil
import subprocess
import sys

import click
import requests
from cachecontrol import CacheControl
from pipdownload import logger
from pipdownload.utils import (TempDirectory, download, get_file_links,
                               mkurl_pypi_url, resolve_package_files)

sess = requests.Session()
session = CacheControl(sess)


@click.command()
@click.argument('packages', nargs=-1)
@click.option('-i', '--index-url', 'index_url',
              default='https://pypi.tuna.tsinghua.edu.cn/simple',
              type=click.STRING,
              help='Pypi index.')
@click.option('-r', '--requirement', 'requirement_file',
              type=click.File(encoding='utf-8'),
              help='Requirements File.')
import os
import sys
import logging
import requests
import json
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache

logging.basicConfig(stream=sys.stdout,
                    format="%(asctime)s: " + logging.BASIC_FORMAT,
                    datefmt="%Y-%m-%dT%H:%M:%S%z")
logger = logging.getLogger(__name__)

req = CacheControl(
    requests.Session(),
    cache=FileCache(os.path.join('/tmp', 'pyutu.cache'))
)

# regions = {
#     'ap-northeast-1': "Asia Pacific (Tokyo)",
#     'ap-northeast-2': "Asia Pacific (Seoul)",
#     'ap-southeast-1': "Asia Pacific (Singapore)",
#     'ap-southeast-2': "Asia Pacific (Sydney)",
#     'ap-south-1': "Asia Pacific (Mumbai)",
#     'ca-central-1': "Canada (Central)",
#     'eu-central-1': "EU (Frankfurt)",
#     'eu-west-1': "EU (Ireland)",
#     'eu-west-2': "EU (London)",
#     'sa-east-1': "South America (Sao Paulo)",
#     'us-east-1': "US East (N. Virginia)",
#     'us-east-2': "US East (Ohio)",
def requests_session(nocache=False):
    if nocache:
        return requests.Session()
    return CacheControl(requests.Session(), cache=FileCache(CACHE_FILENAME))
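# Usage sketch (not from the original source): CACHE_FILENAME is defined in the
# surrounding module; the URL is illustrative. CacheControl tags responses it
# builds with a `from_cache` attribute, which is handy for checking cache hits.
sess = requests_session()               # cached session backed by CACHE_FILENAME
resp = sess.get("https://example.org/api/data")
print(getattr(resp, "from_cache", False))
live = requests_session(nocache=True)   # plain requests.Session, no caching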
def __init__(self, settings):
    self.settings = settings
    self.handler_config = settings.config[settings.env]['handler']
    self.cached_session = CacheControl(requests.session())
    self._init_db()
from flask import Flask, render_template, request
from bs4 import BeautifulSoup
import requests
from cachecontrol import CacheControl

sessionCached = CacheControl(requests.session())
application = Flask(__name__)


@application.route('/', methods=['GET', 'POST'])
def index():
    r = sessionCached.get('https://www.etax.nat.gov.tw/etw-main/web/ETW183W1/')
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")
    link = soup.find_all("a", {"href": lambda s: s and s.startswith("/etw-main/web/ETW183W2_")})[:2]
    link = [x["href"][-5:] for x in link]
    month = request.values.get("month")
    if month is None:
        month = link[0]
    chkRadio = [" active" if month == x else "" for x in link]
    r = sessionCached.get('https://www.etax.nat.gov.tw/etw-main/web/ETW183W2_' + month)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")
    prize = {}
    prize[soup.find(id="specialPrize").parent.td.text.strip()] = "特別獎"
    prize[soup.find(id="grandPrize").parent.td.text.strip()] = "特獎"
    for e in soup.find(id="firstPrize").parent.td.text.split():
        prize[e] = "頭獎"
    for e in soup.find(id="addSixPrize").parent.td.text.strip().split("、"):
# # WG Notifications of deaths of residents related to COVID-19 in adult care homes

from gssutils import *
import json
import numpy as np

if is_interactive():
    from requests import Session
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import ExpiresAfter
    scrape = Scraper(seed="info.json",
                     session=CacheControl(Session(),
                                          cache=FileCache('.cache'),
                                          heuristic=ExpiresAfter(days=1)))

dist = scrape.distribution(
    latest=True,
    title=lambda x: x.startswith(
        'Notifications of deaths of residents related to COVID-19'))
tabs = {tab.name: tab for tab in dist.as_databaker()}
list(tabs)

# +
def left(s, amount):
    return s[:amount]

def right(s, amount):
    return s[-amount:]
class LegacyRepository(PyPiRepository): def __init__( self, name, url, auth=None, disable_cache=False ): # type: (str, str, Optional[Auth], bool) -> None if name == "pypi": raise ValueError("The name [pypi] is reserved for repositories") self._packages = [] self._name = name self._url = url.rstrip("/") self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name self._cache = CacheManager( { "default": "releases", "serializer": "json", "stores": { "releases": {"driver": "file", "path": str(self._cache_dir)}, "packages": {"driver": "dict"}, "matches": {"driver": "dict"}, }, } ) self._session = CacheControl( requests.session(), cache=FileCache(str(self._cache_dir / "_http")) ) url_parts = urlparse.urlparse(self._url) if not url_parts.username and auth: self._session.auth = auth self._disable_cache = disable_cache @property def name(self): return self._name def find_packages( self, name, constraint=None, extras=None, allow_prereleases=False ): packages = [] if constraint is None: constraint = "*" if not isinstance(constraint, VersionConstraint): constraint = parse_constraint(constraint) if isinstance(constraint, VersionRange): if ( constraint.max is not None and constraint.max.is_prerelease() or constraint.min is not None and constraint.min.is_prerelease() ): allow_prereleases = True key = name if not constraint.is_any(): key = "{}:{}".format(key, str(constraint)) if self._cache.store("matches").has(key): versions = self._cache.store("matches").get(key) else: page = self._get("/{}/".format(canonicalize_name(name).replace(".", "-"))) if page is None: return [] versions = [] for version in page.versions: if version.is_prerelease() and not allow_prereleases: continue if constraint.allows(version): versions.append(version) self._cache.store("matches").put(key, versions, 5) for version in versions: package = Package(name, version) package.source_type = "legacy" package.source_url = self._url if extras is not None: package.requires_extras = extras packages.append(package) self._log( "{} packages found for {} {}".format(len(packages), name, str(constraint)), level="debug", ) return packages def package( self, name, version, extras=None ): # type: (...) -> poetry.packages.Package """ Retrieve the release information. This is a heavy task which takes time. We have to download a package to get the dependencies. We also need to download every file matching this release to get the various hashes. Note that, this will be cached so the subsequent operations should be much faster. 
""" try: index = self._packages.index( poetry.packages.Package(name, version, version) ) return self._packages[index] except ValueError: if extras is None: extras = [] release_info = self.get_release_info(name, version) package = poetry.packages.Package(name, version, version) if release_info["requires_python"]: package.python_versions = release_info["requires_python"] package.source_type = "legacy" package.source_url = self._url package.source_reference = self.name requires_dist = release_info["requires_dist"] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(";")[0] dependency = dependency_from_pep_508(req) except ValueError: # Likely unable to parse constraint so we skip it self._log( "Invalid constraint ({}) found in {}-{} dependencies, " "skipping".format(req, package.name, package.version), level="debug", ) continue if dependency.in_extras: for extra in dependency.in_extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get("summary", "") # Adding hashes information package.hashes = release_info["digests"] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += package.extras[extra] self._packages.append(package) return package def _get_release_info(self, name, version): # type: (str, str) -> dict page = self._get("/{}/".format(canonicalize_name(name).replace(".", "-"))) if page is None: raise PackageNotFound('No package named "{}"'.format(name)) data = { "name": name, "version": version, "summary": "", "requires_dist": [], "requires_python": None, "digests": [], "_cache_version": str(self.CACHE_VERSION), } links = list(page.links_for_version(Version.parse(version))) if not links: raise PackageNotFound( 'No valid distribution links found for package: "{}" version: "{}"'.format( name, version ) ) urls = defaultdict(list) hashes = [] for link in links: if link.is_wheel: urls["bdist_wheel"].append(link.url) elif link.filename.endswith( (".tar.gz", ".zip", ".bz2", ".xz", ".Z", ".tar") ): urls["sdist"].append(link.url) hash = link.hash if link.hash_name == "sha256": hashes.append(hash) else: hashes.append(link.hash_name + ":" + hash) data["digests"] = hashes info = self._get_info_from_urls(urls) data["summary"] = info["summary"] data["requires_dist"] = info["requires_dist"] data["requires_python"] = info["requires_python"] return data def _download(self, url, dest): # type: (str, str) -> None r = self._session.get(url, stream=True) with open(dest, "wb") as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) def _get(self, endpoint): # type: (str) -> Union[Page, None] url = self._url + endpoint response = self._session.get(url) if response.status_code == 404: return return Page(url, response.content, response.headers)
class PyPiRepository(Repository): def __init__(self, url='https://pypi.org/', disable_cache=False, fallback=True): self._name = 'PyPI' self._url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / 'pypi' self._cache = CacheManager({ 'default': 'releases', 'serializer': 'json', 'stores': { 'releases': { 'driver': 'file', 'path': str(release_cache_dir) }, 'packages': { 'driver': 'dict' } } }) self._session = CacheControl( session(), cache=FileCache(str(release_cache_dir / '_http')) ) super(PyPiRepository, self).__init__() def find_packages(self, name, # type: str constraint=None, # type: Union[Constraint, str, None] extras=None, # type: Union[list, None] allow_prereleases=False # type: bool ): # type: (...) -> List[Package] """ Find packages on the remote server. """ if constraint is not None and not isinstance(constraint, BaseConstraint): version_parser = VersionParser() constraint = version_parser.parse_constraints(constraint) info = self.get_package_info(name) packages = [] for version, release in info['releases'].items(): if not release: # Bad release self._log( 'No release information found for {}-{}, skipping'.format( name, version ), level='debug' ) continue package = Package(name, version) if package.is_prerelease() and not allow_prereleases: continue if ( not constraint or (constraint and constraint.matches(Constraint('=', version))) ): if extras is not None: package.requires_extras = extras packages.append(package) self._log( '{} packages found for {} {}'.format( len(packages), name, str(constraint) ), level='debug' ) return packages def package(self, name, # type: str version, # type: str extras=None # type: (Union[list, None]) ): # type: (...) -> Union[Package, None] try: index = self._packages.index(Package(name, version, version)) return self._packages[index] except ValueError: if extras is None: extras = [] release_info = self.get_release_info(name, version) if ( self._fallback and release_info['requires_dist'] is None and not release_info['requires_python'] and '_fallback' not in release_info ): # Force cache update self._log( 'No dependencies found, downloading archives', level='debug' ) self._cache.forget('{}:{}'.format(name, version)) release_info = self.get_release_info(name, version) package = Package(name, version, version) requires_dist = release_info['requires_dist'] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(';')[0] dependency = dependency_from_pep_508(req) except ValueError: # Likely unable to parse constraint so we skip it self._log( 'Invalid constraint ({}) found in {}-{} dependencies, ' 'skipping'.format( req, package.name, package.version ), level='debug' ) continue if dependency.extras: for extra in dependency.extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get('summary', '') if release_info['requires_python']: package.python_versions = release_info['requires_python'] if release_info['platform']: package.platform = release_info['platform'] # Adding hashes information package.hashes = release_info['digests'] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += 
package.extras[extra] self._packages.append(package) return package def search(self, query, mode=0): results = [] search = { 'name': query } if mode == self.SEARCH_FULLTEXT: search['summary'] = query client = ServerProxy('https://pypi.python.org/pypi') hits = client.search(search, 'or') for hit in hits: result = Package(hit['name'], hit['version'], hit['version']) result.description = to_str(hit['summary']) results.append(result) return results def get_package_info(self, name): # type: (str) -> dict """ Return the package information given its name. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_package_info(name) return self._cache.store('packages').remember_forever( name, lambda: self._get_package_info(name) ) def _get_package_info(self, name): # type: (str) -> dict data = self._get('pypi/{}/json'.format(name)) if data is None: raise ValueError('Package [{}] not found.'.format(name)) return data def get_release_info(self, name, version): # type: (str, str) -> dict """ Return the release information given a package name and a version. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_release_info(name, version) return self._cache.remember_forever( '{}:{}'.format(name, version), lambda: self._get_release_info(name, version) ) def _get_release_info(self, name, version): # type: (str, str) -> dict json_data = self._get('pypi/{}/{}/json'.format(name, version)) if json_data is None: raise ValueError('Package [{}] not found.'.format(name)) info = json_data['info'] data = { 'name': info['name'], 'version': info['version'], 'summary': info['summary'], 'platform': info['platform'], 'requires_dist': info['requires_dist'], 'requires_python': info['requires_python'], 'digests': [], '_fallback': False } try: version_info = json_data['releases'][version] except KeyError: version_info = [] for file_info in version_info: data['digests'].append(file_info['digests']['sha256']) if ( self._fallback and data['requires_dist'] is None and not data['requires_python'] ): # No dependencies set (along with other information) # This might be due to actually no dependencies # or badly set metadata when uploading # So, we need to make sure there is actually no # dependencies by introspecting packages data['_fallback'] = True urls = {} for url in json_data['urls']: # Only get sdist and universal wheels dist_type = url['packagetype'] if dist_type not in ['sdist', 'bdist_wheel']: continue if dist_type == 'sdist' and 'dist' not in urls: urls[url['packagetype']] = url['url'] continue if 'bdist_wheel' in urls: continue # If bdist_wheel, check if it's universal python_version = url['python_version'] if python_version not in ['py2.py3', 'py3', 'py2']: continue parts = urlparse.urlparse(url['url']) filename = os.path.basename(parts.path) if '-none-any' not in filename: continue if not urls: return data requires_dist = self._get_requires_dist_from_urls(urls) data['requires_dist'] = requires_dist return data def _get(self, endpoint): # type: (str) -> Union[dict, None] json_response = self._session.get(self._url + endpoint) if json_response.status_code == 404: return None json_data = json_response.json() return json_data def _get_requires_dist_from_urls(self, urls ): # type: (dict) -> Union[list, None] if 'bdist_wheel' in urls: return self._get_requires_dist_from_wheel(urls['bdist_wheek']) return self._get_requires_dist_from_sdist(urls['sdist']) def 
_get_requires_dist_from_wheel(self, url ): # type: (str) -> Union[list, None] filename = os.path.basename(urlparse.urlparse(url).path) with temporary_directory() as temp_dir: filepath = os.path.join(temp_dir, filename) self._download(url, filepath) try: meta = pkginfo.Wheel(filepath) except ValueError: # Unable to determine dependencies # Assume none return if meta.requires_dist: return meta.requires_dist def _get_requires_dist_from_sdist(self, url ): # type: (str) -> Union[list, None] filename = os.path.basename(urlparse.urlparse(url).path) with temporary_directory() as temp_dir: filepath = Path(temp_dir) / filename self._download(url, str(filepath)) try: meta = pkginfo.SDist(str(filepath)) if meta.requires_dist: return meta.requires_dist except ValueError: # Unable to determine dependencies # We pass and go deeper pass # Still not dependencies found # So, we unpack and introspect suffix = filepath.suffix gz = None if suffix == '.zip': tar = zipfile.ZipFile(str(filepath)) else: if suffix == '.bz2': gz = BZ2File(str(filepath)) else: gz = GzipFile(str(filepath)) tar = tarfile.TarFile(str(filepath), fileobj=gz) try: tar.extractall(os.path.join(temp_dir, 'unpacked')) finally: if gz: gz.close() tar.close() unpacked = Path(temp_dir) / 'unpacked' sdist_dir = unpacked / Path(filename).name.rstrip('.tar.gz') # Checking for .egg-info eggs = list(sdist_dir.glob('*.egg-info')) if eggs: egg_info = eggs[0] requires = egg_info / 'requires.txt' if requires.exists(): with requires.open() as f: return parse_requires(f.read()) return # Still nothing, assume no dependencies # We could probably get them by executing # python setup.py egg-info but I don't feel # confortable executing a file just for the sake # of getting dependencies. return def _download(self, url, dest): # type: (str, str) -> None r = get(url, stream=True) with open(dest, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) def _log(self, msg, level='info'): getattr(logger, level)('{}: {}'.format(self._name, msg))
def main(group_id, location, time_boundary, event_status, pandoc): key_path = os.path.normpath(os.path.expanduser('~/.meetup.com-key')) if os.path.exists(key_path): with open(key_path) as fh: key = fh.read().strip() cache = FileCache('.web_cache', forever=True) requests = CacheControl( Session(), cache, cache_etags=False, heuristic=ExpiresAfter(days=1) ) while True: resp = requests.get('https://api.meetup.com/status', params=dict(key=key)) if resp.status_code == 200: break elif resp.status_code == 401: click.echo('Your meetup.com key is required. You can get it from https://secure.meetup.com/meetup_api/key/\n') if click.confirm('Open https://secure.meetup.com/meetup_api/key/ in your web browser?'): click.launch('https://secure.meetup.com/meetup_api/key/') click.echo('') key = click.prompt('Key', hide_input=True) else: click.fail('Failed to get meetup.com status. Response was {!r}'.format(resp.text)) click.secho('For convenience your key is saved in `{}`.\n'.format(key_path), fg='magenta') with open(key_path, 'w') as fh: fh.write(key) while not location: location = location or get_input('Location: ', completer=WordCompleter(['cluj', 'iasi', 'timisoara'], ignore_case=True)) while True: group_id = group_id or get_input('Group ID: ', completer=WordCompleter(['Cluj-py', 'RoPython-Timisoara'], ignore_case=True)) resp = requests.get('https://api.meetup.com/2/events', params=dict( key=key, group_urlname=group_id, time=time_boundary, status=event_status, )) if resp.status_code == 200: json = resp.json() if json['results']: break else: click.secho('Invalid group `{}`. It has no events!'.format(group_id), fg='red') group_id = None if resp.status_code == '400': click.fail('Failed to get make correct request. Response was {!r}'.format(resp.text)) else: click.secho('Invalid group `{}`. Response was [{}] {!r}'.format(group_id, resp.status_code, resp.text), fg='red') # click.echo(pformat(dict(resp.headers))) for event in json['results']: dt = datetime.fromtimestamp(event['time']/1000) click.echo("{}: {}".format( dt.strftime('%Y-%m-%d %H:%M:%S'), event['name'] )) existing_path = glob(os.path.join('content', '*', dt.strftime('%Y-%m-%d*'), 'index.rst')) if existing_path: if len(existing_path) > 1: click.secho('\tERROR: multiple paths matched: {}'.format(existing_path)) else: click.secho('\t`{}` already exists. Not importing.'.format(*existing_path), fg='yellow') else: target_dir = os.path.join('content', location, '{}-{}'.format(dt.strftime('%Y-%m-%d'), slugify(event['name']))) target_path = os.path.join(target_dir, 'index.rst') if not os.path.exists(target_dir): os.makedirs(target_dir) if pandoc: with tempfile.NamedTemporaryFile(delete=False) as fh: fh.write(event['description'].encode('utf-8')) rst = subprocess.check_output(['pandoc', '--from=html', '--to=rst', fh.name]).decode('utf-8') print fh.name #os.unlink(fh.name) else: stream = StringIO() html2rest(event['description'].encode('utf-8'), writer=stream) rst = stream.getvalue().decode('utf-8') with io.open(target_path, 'w', encoding='utf-8') as fh: fh.write('''{name} ############################################################### :tags: unknown :registration: meetup.com: {event_url} {rst}'''.format(rst=rst, **event)) click.secho('\tWrote `{}`.'.format(target_path), fg='green')
def get_frag_by_loc_from_osm(imtiles_file, loci, zoom_level=0, padding=0, tile_size=256, no_cache=False): width = 360 height = 180 ims = [] prefixes = ['a', 'b', 'c'] prefix_idx = math.floor(random() * len(prefixes)) osm_src = 'http://{}.tile.openstreetmap.org'.format(prefixes[prefix_idx]) s = CacheControl(requests.Session()) for locus in loci: id = locus[-1] if not no_cache: osm_snip = None try: osm_snip = np.load(BytesIO(rdb.get('osm_snip_%s' % id))) if osm_snip is not None: ims.append(osm_snip) continue except: pass start_lng = locus[0] end_lng = locus[1] start_lat = locus[2] end_lat = locus[3] if not is_within(start_lng + 180, end_lng + 180, end_lat + 90, start_lat + 90, width, height): ims.append(None) continue # Get tile ids start1, start2 = get_tile_pos_from_lng_lat(start_lng, start_lat, zoom_level) end1, end2 = get_tile_pos_from_lng_lat(end_lng, end_lat, zoom_level) xPad = padding * (end1 - start1) yPad = padding * (start2 - end2) start1 -= xPad end1 += xPad start2 += yPad end2 -= yPad tile_start1_id = math.floor(start1) tile_start2_id = math.floor(start2) tile_end1_id = math.floor(end1) tile_end2_id = math.floor(end2) start1 = math.floor(start1 * tile_size) start2 = math.floor(start2 * tile_size) end1 = math.ceil(end1 * tile_size) end2 = math.ceil(end2 * tile_size) tiles_x_range = range(tile_start1_id, tile_end1_id + 1) tiles_y_range = range(tile_start2_id, tile_end2_id + 1) # Make sure that no more than 6 standard tiles (256px) are loaded. if tile_size * len(tiles_x_range) > hss.SNIPPET_OSM_MAX_DATA_DIM: raise SnippetTooLarge() if tile_size * len(tiles_y_range) > hss.SNIPPET_OSM_MAX_DATA_DIM: raise SnippetTooLarge() # Extract image tiles tiles = [] for y in tiles_y_range: for x in tiles_x_range: src = ('{}/{}/{}/{}.png'.format(osm_src, zoom_level, x, y)) r = s.get(src) if r.status_code == 200: tiles.append(Image.open(BytesIO(r.content)).convert('RGB')) else: tiles.append(None) osm_snip = get_frag_from_image_tiles(tiles, tile_size, tiles_x_range, tiles_y_range, tile_start1_id, tile_start2_id, start1, end1, start2, end2) if not no_cache: with BytesIO() as b: np.save(b, osm_snip) rdb.set('osm_snip_%s' % id, b.getvalue(), 60 * 30) ims.append(osm_snip) return ims
def main(control_url, test_url, all_available, package_ignore, diff_compose, undercloud_only): """ This script takes two urls for ci log files and compares the rpms installed in each environment. We have named the first a control_url as in a control and experiment to display the diff. The script will pull rpms from ALL the nodes available, and the containers hosted on that node. This workds with both upstream tripleo jobs and infrared job logs. USAGE: The script expects only the base url ( up to the logs dir ) of the logs from any job. e.g. https://logserver.rdoproject.org/foo/check/jobs/7822e6c/logs/ """ diff_builds = DiffBuilds() debug_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' logging.basicConfig(level=logging.DEBUG, format=debug_format, datefmt='%m-%d %H:%M', filename='debug.log', filemode='w') console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter(': %(levelname)-8s %(message)s') console.setFormatter(formatter) logging.getLogger('').addHandler(console) sess = requests.session() cached_sess = CacheControl(sess) full_package_diff = {} # default ignore ignore_packages = {".*debuginfo", ".*debugsource", ".*-devel", ".*-doc"} # debug inputs logging.debug("input: control_url: {}".format(control_url)) logging.debug("input: test_url: {}".format(test_url)) logging.debug("input: all_available: {}".format(all_available)) logging.debug("input: package_ignore: {}".format(package_ignore)) logging.debug("input: diff_compose: {}".format(diff_compose)) logging.debug("input: undercloud_only: {}".format(undercloud_only)) if package_ignore: with open(package_ignore) as f: ignore_packages.update(set(f.read().splitlines())) if not all_available and not diff_compose: results = diff_builds.execute_installed_package_diff( cached_sess, control_url, test_url, all_available, ignore_packages, undercloud_only) full_package_diff = results[0] column_list = results[1] elif all_available and not diff_compose: results = diff_builds.execute_repoquery_diff(cached_sess, control_url, test_url, all_available, ignore_packages) full_package_diff = results[0] column_list = results[1] elif diff_compose: results = diff_builds.execute_compose_diff(cached_sess, control_url, test_url, all_available, ignore_packages) full_package_diff = results[0] column_list = results[1] else: print("Error with options provided") logging.info("\n\n **** RESULT **** \n\n") for k in full_package_diff.keys(): diff_builds.display_packages_table(k, column_list, full_package_diff[k])
class Handler(object): def __init__(self, settings): self.settings = settings self.handler_config = settings.config[settings.env]['handler'] self.cached_session = CacheControl(requests.session()) self._init_db() def _get_db_conn(self): return sqlite3.connect(self.handler_config['dbfile']) def _init_db(self): con = self._get_db_conn() cur = con.cursor() stmt = ''' CREATE TABLE IF NOT EXISTS temporary_bounces ( bounced_address TEXT, domain TEXT, counter INTEGER ); ''' cur.execute(stmt.strip()) con.commit() stmt = ''' CREATE TABLE IF NOT EXISTS permanent_bounces ( ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, bounced_address TEXT, domain TEXT, status_code INTEGER ); ''' cur.execute(stmt.strip()) con.commit() cur.close() con.close() def _increase_bounced_address_counter(self, bounced_address, domain): con = self._get_db_conn() cur = con.cursor() stmt = ''' INSERT OR REPLACE INTO temporary_bounces VALUES (:bounced_address, :domain, COALESCE( (SELECT counter FROM temporary_bounces WHERE bounced_address=:bounced_address AND domain=:domain), 0) + 1); ''' cur.execute(stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain }) con.commit() cur.close() con.close() def _get_bounced_address_counter(self, bounced_address, domain): con = self._get_db_conn() cur = con.cursor() stmt = ''' SELECT counter FROM temporary_bounces WHERE bounced_address=:bounced_address AND domain=:domain; ''' cur.execute(stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain }) row = cur.fetchone() result = 0 if row: result = int(row[0]) cur.close() con.close() return result def _find_address(self, address): con = self._get_db_conn() cur = con.cursor() stmt = ''' SELECT * FROM permanent_bounces WHERE bounced_address LIKE :bounced_address; ''' cur.execute(stmt.strip(), {'bounced_address': '%{0}%'.format(address)}) permanent_bounces = cur.fetchall() stmt = ''' SELECT * FROM temporary_bounces WHERE bounced_address LIKE :bounced_address; ''' cur.execute(stmt.strip(), {'bounced_address': '%{0}%'.format(address)}) temporary_bounces = cur.fetchall() cur.close() con.close() return permanent_bounces, temporary_bounces def _reset_bounced_address(self, bounced_address, domain): con = self._get_db_conn() cur = con.cursor() stmt = ''' DELETE FROM temporary_bounces WHERE bounced_address=:bounced_address AND domain=:domain; ''' cur.execute(stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain }) con.commit() cur.close() con.close() def _set_permanent_bounced_address(self, bounced_address, domain, status_code): con = self._get_db_conn() cur = con.cursor() stmt = ''' INSERT INTO permanent_bounces (bounced_address, domain, status_code) VALUES (:bounced_address, :domain, :status_code); ''' cur.execute( stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain, 'status_code': status_code }) con.commit() cur.close() con.close() def _get_origin_to_domains(self, msg): ''' return the domains to which the origin email was sent ''' to_addresses = [ address for _, address in [parseaddr(x.strip()) for x in msg['To'].split(",")] ] domains = [] for a in to_addresses: parts = tldextract.extract(a.split("@")[1]) domains.append("%s.%s" % (parts[-2], parts[-1])) return domains def _store_permanent_bounced_email(self, bounced_address, body): if not ('permanent_bounced_emails_path' in self.handler_config and body): return dir_path = os.path.join( self.handler_config['permanent_bounced_emails_path'], bounced_address[0:2].lower()) if not os.path.exists(dir_path): os.makedirs(dir_path) path = os.path.join(dir_path, 
bounced_address + '.gz') content = bytes(body) with gzip.open(path, 'wb') as f: f.write(content) def _handle_out_of_office_message(self, msg): pass def _handle_temporary_bounced_address(self, bounced_address, domain, body): temporary_threshold = self.handler_config['temporary_threshold'] current_counter = self._get_bounced_address_counter( bounced_address, domain) if current_counter > temporary_threshold: self._handle_permanent_bounced_address(bounced_address, domain, body) self._reset_bounced_address(bounced_address, domain) return self._increase_bounced_address_counter(bounced_address, domain) def _default_url_resolver(self, bounced_address, config): tpl = URITemplate(config['base_url']) return tpl.expand(address=bounced_address) def _xikolo_url_resolver(self, bounced_address, config): response = self.cached_session.get(config['base_url']) uri = response.json()['email_suspensions_url'] tpl = URITemplate(uri) return tpl.expand(address=bounced_address) def _handle_permanent_bounced_address(self, bounced_address, domain, body): config = self.handler_config['domains'][domain] if 'url_resolver' in config and config['url_resolver'] == 'xikolo': endpoint = self._xikolo_url_resolver(bounced_address, config) else: endpoint = self._default_url_resolver(bounced_address, config) logger.debug("Post request to: %s for address: %s", endpoint, bounced_address) response = self.cached_session.post(endpoint, data={}) logger.info("Response (%s): %s ", response.status_code, response.text) self._set_permanent_bounced_address(bounced_address, domain, response.status_code) self._store_permanent_bounced_email(bounced_address, body) def set_permanent_bounced_address(self, bounced_address, domain): ''' handles manually bounced email addresses ''' logger.debug("Permanent: %s", bounced_address) self._handle_permanent_bounced_address(bounced_address, domain, '') def find_address(self, address): ''' Find an email address within permanent or temporary bounced emails ''' logger.debug("Find: %s", address) permanent_bounces, temporary_bounces = self._find_address(address) logger.debug('> Permanent bounces for address: "{0}"'.format(address)) for entry in permanent_bounces: logger.debug(entry) logger.debug('> Temporary bounces for address: "{0}"'.format(address)) for entry in temporary_bounces: logger.debug(entry) def handle_message(self, body): ''' handles soft and hard bounced emails ''' msg = email.message_from_bytes(bytes(body)) logger.info("------------- INCOMING MESSAGE -------------") for key, value in msg.items(): if any(key.startswith(h) for h in ['From', 'To', 'Subject']): logger.info("%s:\t%s", key, value) for domain in self._get_origin_to_domains(msg): if domain in self.handler_config['domains'].keys(): break else: raise BouncedEmailException("Domain '%s' not found" % domain) t, p = all_failures(msg) def validate_addresses(bounced_addresses): address_list = [] for address in bounced_addresses: address = address.decode('utf-8') if validate_email(address): address_list.append(address) return address_list temporary = validate_addresses(t) permanent = validate_addresses(p) if not (temporary or permanent): return self._handle_out_of_office_message(msg) logger.info("Domain: %s", domain) for bounced_address in temporary: # sometimes a temporary failure is a permanent failure as well (strange, but yes) if bounced_address in permanent: continue logger.info("Temporary: %s", bounced_address) self._handle_temporary_bounced_address(bounced_address, domain, body) for bounced_address in permanent: logger.info("Permanent: %s", 
bounced_address) self._handle_permanent_bounced_address(bounced_address, domain, body)
def __init__(self, cache=None, **kw):
    self._cache = os.path.realpath(
        os.path.expanduser(cache or self.DEFAULT_CACHE))
    super(CachedRequestsContext, self).__init__(
        CacheControl(requests.session(), cache=FileCache(self._cache)), **kw)
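For context, here is a minimal standalone sketch of the pattern CachedRequestsContext wraps: expand a user-supplied cache directory and back a requests session with a FileCache. The helper name make_cached_session and the default directory are assumptions for illustration only, not part of the snippet above.

import os

import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache


def make_cached_session(cache_dir='~/.example-cache'):
    # expand the user-supplied directory and wrap the session with a file-backed cache
    cache_path = os.path.realpath(os.path.expanduser(cache_dir))
    return CacheControl(requests.Session(), cache=FileCache(cache_path))


# session = make_cached_session()
# session.get('https://example.com/')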
class Entry(Base):
    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    url = Column(String(1024))
    expiry = Column(DateTime)
    response = Column(PickleType)

    def __repr__(self):
        return 'Entry[url={}, expiry={}, response={}]'.format(
            self.url, self.expiry, self.response)


engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'])
Session = sessionmaker(bind=engine)
req_session = CacheControl(requests.session(), cache=FileCache('.webcache'))


def init_db():
    Base.metadata.create_all(engine)


def fetch(url):
    try:
        session = Session()
        now = datetime.datetime.utcnow()
        cached = session.query(Entry).filter_by(url=url).first()
        app.logger.debug('check for cached response %s', cached)
        if not cached or now >= cached.expiry:
class LegacyRepository(PyPiRepository): def __init__(self, name, url): if name == 'pypi': raise ValueError('The name [pypi] is reserved for repositories') self._packages = [] self._name = name self._url = url.rstrip('/') self._cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / name self._cache = CacheManager({ 'default': 'releases', 'serializer': 'json', 'stores': { 'releases': { 'driver': 'file', 'path': str(self._cache_dir) }, 'packages': { 'driver': 'dict' }, 'matches': { 'driver': 'dict' } } }) self._session = CacheControl(requests.session(), cache=FileCache( str(self._cache_dir / '_http'))) @property def name(self): return self._name def find_packages(self, name, constraint=None, extras=None, allow_prereleases=False): packages = [] if constraint is not None and not isinstance(constraint, VersionConstraint): constraint = parse_constraint(constraint) key = name if constraint: key = '{}:{}'.format(key, str(constraint)) if self._cache.store('matches').has(key): versions = self._cache.store('matches').get(key) else: page = self._get('/{}'.format( canonicalize_name(name).replace('.', '-'))) if page is None: raise ValueError('No package named "{}"'.format(name)) versions = [] for version in page.versions: if (not constraint or (constraint and constraint.allows(version))): versions.append(version) self._cache.store('matches').put(key, versions, 5) for version in versions: package = Package(name, version) if extras is not None: package.requires_extras = extras packages.append(package) return packages def package(self, name, version, extras=None): # type: (...) -> poetry.packages.Package """ Retrieve the release information. This is a heavy task which takes time. We have to download a package to get the dependencies. We also need to download every file matching this release to get the various hashes. Note that, this will be cached so the subsequent operations should be much faster. """ try: index = self._packages.index( poetry.packages.Package(name, version, version)) return self._packages[index] except ValueError: if extras is None: extras = [] release_info = self.get_release_info(name, version) package = poetry.packages.Package(name, version, version) requires_dist = release_info['requires_dist'] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(';')[0] dependency = dependency_from_pep_508(req) if dependency.extras: for extra in dependency.extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get('summary', '') # Adding hashes information package.hashes = release_info['digests'] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += package.extras[extra] self._packages.append(package) return package def get_release_info(self, name, version): # type: (str, str) -> dict """ Return the release information given a package name and a version. The information is returned from the cache if it exists or retrieved from the remote server. 
""" return self._cache.store('releases').remember_forever( '{}:{}'.format(name, version), lambda: self._get_release_info(name, version)) def _get_release_info(self, name, version): # type: (str, str) -> dict page = self._get('/{}'.format( canonicalize_name(name).replace('.', '-'))) if page is None: raise ValueError('No package named "{}"'.format(name)) data = { 'name': name, 'version': version, 'summary': '', 'requires_dist': [], 'requires_python': [], 'digests': [] } links = list(page.links_for_version(Version.parse(version))) urls = {} hashes = [] default_link = links[0] for link in links: if link.is_wheel: urls['bdist_wheel'] = link.url elif link.filename.endswith('.tar.gz'): urls['sdist'] = link.url elif link.filename.endswith( ('.zip', '.bz2')) and 'sdist' not in urls: urls['sdist'] = link.url hash = link.hash if link.hash_name == 'sha256': hashes.append(hash) data['digests'] = hashes if not urls: if default_link.is_wheel: m = wheel_file_re.match(default_link.filename) python = m.group('pyver') platform = m.group('plat') if python == 'py2.py3' and platform == 'any': urls['bdist_wheel'] = default_link.url elif default_link.filename.endswith('.tar.gz'): urls['sdist'] = default_link.url elif default_link.filename.endswith( ('.zip', '.bz2')) and 'sdist' not in urls: urls['sdist'] = default_link.url else: return data info = self._get_info_from_urls(urls) data['summary'] = info['summary'] data['requires_dist'] = info['requires_dist'] data['requires_python'] = info['requires_python'] return data def _get(self, endpoint): # type: (str) -> Union[Page, None] url = self._url + endpoint response = self._session.get(url) if response.status_code == 404: return return Page(url, response.content, response.headers)
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def filehash(filename, hashtype, blocksize=65536):
    hash = hashtype()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()


forever_cache = FileCache('http_cache', forever=True)
sess = CacheControl(requests.Session(), forever_cache)

# get the remote version list fragments
r = sess.get(
    'https://files.minecraftforge.net/maven/net/minecraftforge/forge/maven-metadata.json'
)
r.raise_for_status()
main_json = r.json()
assert type(main_json) == dict

r = sess.get(
    'https://files.minecraftforge.net/maven/net/minecraftforge/forge/promotions_slim.json'
)
r.raise_for_status()
promotions_json = r.json()
assert type(promotions_json) == dict
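A small follow-up sketch of the same "forever" pattern with the cache passed by keyword; with forever=True the FileCache keeps cached files on disk rather than deleting them, so entries persist across runs. The first URL is taken from the snippet above, the cache directory name is illustrative.

import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# forever=True means the file cache never removes its entries on delete()
forever_sess = CacheControl(requests.Session(),
                            cache=FileCache('forge_http_cache', forever=True))

resp = forever_sess.get(
    'https://files.minecraftforge.net/maven/net/minecraftforge/forge/maven-metadata.json')
resp.raise_for_status()
versions = resp.json()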
import sqlite3

import requests
from bs4 import BeautifulSoup
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

sess = CacheControl(requests.Session(), cache=FileCache(
    'relaxdays_cache'))  # we don't want to torture the server unnecessarily
standard_url = 'https://relaxdays.de/catalogsearch/result/index/?q={}&product_list_dir=asc&product_list_order=sale_rank&product_list_limit=48'

db = sqlite3.connect("daten.db")
c = db.cursor()

working = []
broken = []


def get_items(category: str):
    url = standard_url.format(category)
    r = sess.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        warning = soup.find('div', {'class': 'message notice'})
        if warning is None:
            items_html = soup.find('ol', {
                'class': 'products list items product-items'
            }).find_all('li')
            c.execute(
                'CREATE TABLE IF NOT EXISTS "{}" (id NUMERIC UNIQUE, name TEXT, price NUMERIC, url TEXT, image TEXT)'
                .format(category))
An example of how to transparently cache jikanpy requests
using the cachecontrol/requests modules

To install:

    pip install --user cachecontrol[filecache] jikanpy
"""

import time

import requests
import jikanpy
from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter
from cachecontrol.caches.file_cache import FileCache

# define heuristic, how long requests should stay in cache
# you can modify this to fit whatever you want
# it accepts the same kwargs as datetime.timedelta:
# https://docs.python.org/3/library/datetime.html#datetime.timedelta
expires = ExpiresAfter(days=1)

# create session and mount file cache
session = CacheControl(requests.Session(), heuristic=expires, cache=FileCache("cache_dir"))

# use session for jikanpy
j = jikanpy.Jikan(session=session)

# the second request here is cached
print(j.anime(1)["title"])
print(j.anime(1)["title"])
class RequestsClient(HttpClient): """An implementation of HttpClient that uses Requests as its HTTP Client Attributes: timeout (int): The default timeout for all API requests. """ def __init__(self, timeout=60, cache=False, max_retries=None, retry_interval=None): """The constructor. Args: timeout (float): The default global timeout(seconds). """ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) self.timeout = timeout self.session = requests.session() if max_retries and retry_interval: retries = Retry(total=max_retries, backoff_factor=retry_interval) self.session.mount('http://', HTTPAdapter(max_retries=retries)) self.session.mount('https://', HTTPAdapter(max_retries=retries)) if cache: self.session = CacheControl(self.session) def execute_as_string(self, request): """Execute a given HttpRequest to get a string response back Args: request (HttpRequest): The given HttpRequest to execute. Returns: HttpResponse: The response of the HttpRequest. """ self.session.verify = not Configuration.skip_ssl_verification response = self.session.request(HttpMethodEnum.to_string( request.http_method), request.query_url, headers=request.headers, params=request.query_parameters, data=request.parameters, files=request.files, timeout=self.timeout) return self.convert_response(response, False) def execute_as_binary(self, request): """Execute a given HttpRequest to get a binary response back Args: request (HttpRequest): The given HttpRequest to execute. Returns: HttpResponse: The response of the HttpRequest. """ self.session.verify = not Configuration.skip_ssl_verification response = self.session.request(HttpMethodEnum.to_string( request.http_method), request.query_url, headers=request.headers, params=request.query_parameters, data=request.parameters, files=request.files, timeout=self.timeout) return self.convert_response(response, True) def convert_response(self, response, binary): """Converts the Response object of the HttpClient into an HttpResponse object. Args: response (dynamic): The original response object. Returns: HttpResponse: The converted HttpResponse object. """ if binary: return HttpResponse(response.status_code, response.headers, response.content) else: return HttpResponse(response.status_code, response.headers, response.text)
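One caveat worth noting about the combination used by RequestsClient above: CacheControl() mounts its own adapter for the 'http://' and 'https://' prefixes, which replaces an HTTPAdapter mounted earlier on those prefixes. If both retries and caching are wanted on the same session, one option is to give the retry policy to the CacheControlAdapter itself. This is a sketch under that assumption; the retry values and cache path are placeholders.

import requests
from urllib3.util.retry import Retry
from cachecontrol import CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache

retries = Retry(total=3, backoff_factor=0.5)
# CacheControlAdapter subclasses requests' HTTPAdapter, so it accepts max_retries
adapter = CacheControlAdapter(cache=FileCache('.http_cache'), max_retries=retries)

session = requests.Session()
session.mount('http://', adapter)
session.mount('https://', adapter)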
DEBUG = 1

import ssl
import datetime
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
import tempfile
import os

import requests  # required for requests.session() below; missing from the original excerpt

USERAGENT = {
    'User-agent': 'LibreofficeProjectMacro/0.1 ([email protected])'
}
CACHE_DIRECTORY = os.path.join(tempfile.gettempdir(), 'LibreofficeProjectMacro')
WIKILOGINS = {}

cached_session = CacheControl(requests.session(), cache=FileCache(CACHE_DIRECTORY))
cached_session.headers.update(USERAGENT)


def writeInformation(level, information):
    try:
        global CURRENT_INFORMATION_ROW
        desktop = XSCRIPTCONTEXT.getDesktop()
        model = desktop.getCurrentComponent()
        sheet = config('Information')
        if not model.Sheets.hasByName(sheet):
            model.Sheets.insertNewByName(sheet, model.Sheets.getCount() + 1)
        sheet = model.Sheets.getByName(sheet)
        try:
            index = CURRENT_INFORMATION_ROW
        except NameError:
def setup(self):
    self.sess = Session()
    self.cache_sess = CacheControl(self.sess, heuristic=ExpiresAfter(days=1))
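A brief sketch of what the ExpiresAfter heuristic above does in practice: it is a session-wide override that marks every response as fresh for the given period, regardless of the server's own caching headers, so a repeated request within that window should be answered from the cache (the default in-memory DictCache here). The URL is a placeholder.

import requests
from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter

sess = CacheControl(requests.Session(), heuristic=ExpiresAfter(hours=6))
sess.get('https://example.com/rarely-changing-resource')   # hits the network
sess.get('https://example.com/rarely-changing-resource')   # answered from the in-memory cache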
class Connection(object): """ Handler for connection and calls to the Open Targets Validation Platform REST API """ _AUTO_GET_TOKEN = 'auto' def __init__(self, host='https://www.targetvalidation.org', port=443, api_version='latest', auth_app_name = None, auth_secret = None, use_http2=False, ): """ Args: host (str): host serving the API port (int): port to use for connection to the API api_version (str): api version to point to, default to 'latest' auth_app_name (str): app_name if using authentication auth_secret (str): secret if using authentication use_http2 (bool): use http2 client """ self._logger = logging.getLogger(__name__) self.host = host self.port = str(port) self.api_version = api_version self.auth_app_name = auth_app_name self.auth_secret = auth_secret if self.auth_app_name and self.auth_secret: self.use_auth = True else: self.use_auth = False self.token = None self.use_http2 = use_http2 session= requests.Session() if self.use_http2: session.mount(host, HTTP20Adapter()) self.session = CacheControl(session) self._get_remote_api_specs() def _build_url(self, endpoint): return '{}:{}/api/{}{}'.format(self.host, self.port, self.api_version, endpoint,) @staticmethod def _auto_detect_post(params): """ Determine if a post request should be made instead of a get depending on the size of the parameters in the request. Args: params (dict): params to pass in the request Returns: Boolean: True if post is needed """ if params: for k,v in params.items(): if isinstance(v, (list, tuple)): if len(v)>3: return True return False def get(self, endpoint, params=None): """ makes a GET request Args: endpoint (str): REST API endpoint to call params (dict): request payload Returns: Response: request response """ if self._auto_detect_post(params): self._logger.debug('switching to POST due to big size of params') return self.post(endpoint, data=params) return Response(self._make_request(endpoint, params=params, method='GET')) def post(self, endpoint, data=None): """ makes a POST request Args: endpoint (str): REST API endpoint to call data (dict): request payload Returns: Response: request response """ return Response(self._make_request(endpoint, data=data, method='POST')) def _make_token_request(self, expire = 60): """ Asks for a token to the API Args: expire (int): expiration time for the token Returns: response for the get token request """ return self._make_request('/public/auth/request_token', params={'app_name':self.auth_app_name, 'secret':self.auth_secret, 'expiry': expire}, headers={'Cache-Control':'no-cache',} ) def get_token(self, expire = 60): """ Asks for a token to the API Args: expire (int): expiration time for the token Returns: str: the token served by the API """ response = self._make_token_request(expire) return response.json()['token'] def _make_request(self, endpoint, params = None, data = None, method = HTTPMethods.GET, headers = {}, rate_limit_fail = False, **kwargs): """ Makes a request to the REST API Args: endpoint (str): endpoint of the REST API params (dict): payload for GET request data (dict): payload for POST request method (HTTPMethods): request method, either HTTPMethods.GET or HTTPMethods.POST. Defaults to HTTPMethods.GET headers (dict): HTTP headers for the request rate_limit_fail (bool): If True raise exception when usage limit is exceeded. If False wait and retry the request. Defaults to False. 
Keyword Args: **kwargs: forwarded to requests Returns: a response from requests """ def call(): headers['User-agent']='Open Targets Python Client/%s'%str(VERSION) if self.use_http2 and set(headers.keys())&INVALID_HTTP2_HEADERS: for h in INVALID_HTTP2_HEADERS: if h in headers: del headers[h] return self.session.request(method, self._build_url(endpoint), params = params, json = data, headers = headers, **kwargs) 'order params to allow efficient caching' if params is not None: if isinstance(params, dict): params = sorted(params.items()) else: params = sorted(params) if self.use_auth and not 'request_token' in endpoint: if self.token is None: self._update_token() if self.token is not None: headers['Auth-Token']=self.token response = None default_retry_after = 5 if not rate_limit_fail: status_code = 429 while status_code in [429,419]: try: response = call() status_code = response.status_code if status_code == 429: retry_after=default_retry_after if 'Retry-After' in response.headers: retry_after = float(response.headers['Retry-After']) self._logger.warning('Maximum usage limit hit. Retrying in {} seconds'.format(retry_after)) time.sleep(retry_after) elif status_code == 419: self._update_token(force = True) headers['Auth-Token'] = self.token time.sleep(0.5) except MaxRetryError as e: self._logger.exception(e.args[0].reason) self._logger.warning('Problem connecting to the remote API. Retrying in {} seconds'.format(default_retry_after)) time.sleep(default_retry_after) except OSError as e: self._logger.exception(str(e)) self._logger.warning('Problem connecting to the remote API. Retrying in {} seconds'.format(default_retry_after)) time.sleep(default_retry_after) else: response = call() response.raise_for_status() return response def _update_token(self, force = False): """ Update token when expired """ if self.token and not force: token_valid_response = self._make_request('/public/auth/validate_token', headers={'Auth-Token':self.token}) if token_valid_response.status_code == 200: return elif token_valid_response.status_code == 419: pass else: token_valid_response.raise_for_status() self.token = self.get_token() def _get_remote_api_specs(self): """ Fetch and parse REST API documentation """ r= self.session.get(self.host+':'+self.port+'/api/docs/swagger.yaml') r.raise_for_status() self.swagger_yaml = r.text self.api_specs = yaml.load(self.swagger_yaml) self.endpoint_validation_data={} for p, data in self.api_specs['paths'].items(): p=p.split('{')[0] if p[-1]== '/': p=p[:-1] self.endpoint_validation_data[p] = {} for method, method_data in data.items(): if 'parameters' in method_data: params = {} for par in method_data['parameters']: par_type = par.get('type', 'string') params[par['name']]=par_type self.endpoint_validation_data[p][method] = params remote_version = self.get('/public/utils/version').data if remote_version != VERSION: self._logger.warning('The remote server is running the API with version {}, but the client expected {}. They may not be compatible.'.format(remote_version, VERSION)) def validate_parameter(self, endpoint, filter_type, value, method=HTTPMethods.GET): """ Validate payload to send to the REST API based on info fetched from the API documentation Args: endpoint (str): endpoint of the REST API filter_type (str): the parameter sent for the request value: the value sent for the request method (HTTPMethods): request method, either HTTPMethods.GET or HTTPMethods.POST. 
Defaults to HTTPMethods.GET Raises AttributeError: if validation is not passed """ endpoint_data = self.endpoint_validation_data[endpoint][method] if filter_type in endpoint_data: if endpoint_data[filter_type] == 'string' and isinstance(value, str): return elif endpoint_data[filter_type] == 'boolean' and isinstance(value, bool): return elif endpoint_data[filter_type] == 'number' and isinstance(value, (int, float)): return raise AttributeError('{}={} is not a valid parameter for endpoint {}'.format(filter_type, value, endpoint)) def api_endpoint_docs(self, endpoint): """ Returns the documentation available for a given REST API endpoint Args: endpoint (str): endpoint of the REST API Returns: dict: documentation for the endpoint parsed from YAML docs """ return self.api_specs['paths'][endpoint] def get_api_endpoints(self): """ Get a list of available endpoints Returns: list: available endpoints """ return self.api_specs['paths'].keys() def close(self): """ Close connection to the REST API """ self.session.close() def ping(self): """ Pings the API as a live check Returns: bool: True if pinging the raw response as a ``str`` if the API has a non standard name """ response = self.get('/public/utils/ping') if response.data=='pong': return True elif response.data: return response.data return False
import os
import shutil
import stat
import urllib.request
import zipfile
from typing import Any, Dict, Optional

import requests
from cachecontrol import CacheControl, CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

from shared import configuration, perf
from shared.pd_exception import OperationalException

SESSION = CacheControl(requests.Session(), cache=FileCache(configuration.get('web_cache')))
SESSION.mount('http://whatsinstandard.com', CacheControlAdapter(heuristic=ExpiresAfter(days=14)))


def unzip(url: str, path: str) -> str:
    location = '{scratch_dir}/zip'.format(scratch_dir=configuration.get('scratch_dir'))

    def remove_readonly(func, path, _):
        os.chmod(path, stat.S_IWRITE)
        func(path)

    shutil.rmtree(location, True, remove_readonly)
    os.mkdir(location)
    store(url, '{location}/zip.zip'.format(location=location))
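A short sketch of the per-host pattern used above: the FileCache-backed session caches whatever the servers allow, while a separately mounted CacheControlAdapter applies its own heuristic to a single slow-moving host. The host name and cache path are placeholders.

import requests
from cachecontrol import CacheControl, CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

session = CacheControl(requests.Session(), cache=FileCache('.web_cache'))
# responses from this host are treated as fresh for a week, independent of their headers
session.mount('https://slow-moving.example.com',
              CacheControlAdapter(heuristic=ExpiresAfter(days=7)))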
def test_expires_heuristic_arg(self):
    sess = Session()
    cached_sess = CacheControl(sess, heuristic=Mock())
    assert cached_sess
def setup(self):
    self.sess = Session()
    self.cached_sess = CacheControl(self.sess, heuristic=OneDayCache())
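OneDayCache is one of the heuristics shipped with cachecontrol; a custom heuristic can be written the same way by subclassing BaseHeuristic and rewriting the response headers before they are cached. The class below is a minimal sketch of that pattern, not the OneDayCache implementation itself; the name CacheFor is hypothetical.

from calendar import timegm
from datetime import datetime, timedelta
from email.utils import formatdate

from cachecontrol.heuristics import BaseHeuristic


class CacheFor(BaseHeuristic):
    """Hypothetical heuristic: treat every response as fresh for a fixed number of seconds."""

    def __init__(self, seconds):
        self.seconds = seconds

    def update_headers(self, response):
        # headers returned here overwrite the server's before the response is cached
        expires = datetime.utcnow() + timedelta(seconds=self.seconds)
        return {
            'expires': formatdate(timegm(expires.timetuple()), usegmt=True),
            'cache-control': 'public',
        }

    def warning(self, response):
        return '110 - "Response is Stale"'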
class PyPiRepository(Repository): CACHE_VERSION = parse_constraint("1.0.0b2") def __init__(self, url="https://pypi.org/", disable_cache=False, fallback=True): self._url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / "pypi" self._cache = CacheManager( { "default": "releases", "serializer": "json", "stores": { "releases": {"driver": "file", "path": str(release_cache_dir)}, "packages": {"driver": "dict"}, }, } ) self._cache_control_cache = FileCache(str(release_cache_dir / "_http")) self._session = CacheControl(session(), cache=self._cache_control_cache) self._inspector = Inspector() super(PyPiRepository, self).__init__() self._name = "PyPI" @property def url(self): # type: () -> str return self._url @property def authenticated_url(self): # type: () -> str return self._url def find_packages( self, name, # type: str constraint=None, # type: Union[VersionConstraint, str, None] extras=None, # type: Union[list, None] allow_prereleases=False, # type: bool ): # type: (...) -> List[Package] """ Find packages on the remote server. """ if constraint is None: constraint = "*" if not isinstance(constraint, VersionConstraint): constraint = parse_constraint(constraint) if isinstance(constraint, VersionRange): if ( constraint.max is not None and constraint.max.is_prerelease() or constraint.min is not None and constraint.min.is_prerelease() ): allow_prereleases = True try: info = self.get_package_info(name) except PackageNotFound: self._log( "No packages found for {} {}".format(name, str(constraint)), level="debug", ) return [] packages = [] for version, release in info["releases"].items(): if not release: # Bad release self._log( "No release information found for {}-{}, skipping".format( name, version ), level="debug", ) continue try: package = Package(name, version) except ParseVersionError: self._log( 'Unable to parse version "{}" for the {} package, skipping'.format( version, name ), level="debug", ) continue if package.is_prerelease() and not allow_prereleases: continue if not constraint or (constraint and constraint.allows(package.version)): if extras is not None: package.requires_extras = extras packages.append(package) self._log( "{} packages found for {} {}".format(len(packages), name, str(constraint)), level="debug", ) return packages def package( self, name, # type: str version, # type: str extras=None, # type: (Union[list, None]) ): # type: (...) 
-> Union[Package, None] if extras is None: extras = [] release_info = self.get_release_info(name, version) package = Package(name, version, version) requires_dist = release_info["requires_dist"] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(";")[0] dependency = dependency_from_pep_508(req) except ValueError: # Likely unable to parse constraint so we skip it self._log( "Invalid constraint ({}) found in {}-{} dependencies, " "skipping".format(req, package.name, package.version), level="debug", ) continue if dependency.in_extras: for extra in dependency.in_extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get("summary", "") if release_info["requires_python"]: package.python_versions = release_info["requires_python"] if release_info["platform"]: package.platform = release_info["platform"] # Adding hashes information package.files = release_info["files"] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += package.extras[extra] return package def search(self, query): results = [] search = {"q": query} response = session().get(self._url + "search", params=search) content = parse(response.content, namespaceHTMLElements=False) for result in content.findall(".//*[@class='package-snippet']"): name = result.find("h3/*[@class='package-snippet__name']").text version = result.find("h3/*[@class='package-snippet__version']").text if not name or not version: continue description = result.find("p[@class='package-snippet__description']").text if not description: description = "" try: result = Package(name, version, description) result.description = to_str(description.strip()) results.append(result) except ParseVersionError: self._log( 'Unable to parse version "{}" for the {} package, skipping'.format( version, name ), level="debug", ) return results def get_package_info(self, name): # type: (str) -> dict """ Return the package information given its name. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_package_info(name) return self._cache.store("packages").remember_forever( name, lambda: self._get_package_info(name) ) def _get_package_info(self, name): # type: (str) -> dict data = self._get("pypi/{}/json".format(name)) if data is None: raise PackageNotFound("Package [{}] not found.".format(name)) return data def get_release_info(self, name, version): # type: (str, str) -> dict """ Return the release information given a package name and a version. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_release_info(name, version) cached = self._cache.remember_forever( "{}:{}".format(name, version), lambda: self._get_release_info(name, version) ) cache_version = cached.get("_cache_version", "0.0.0") if parse_constraint(cache_version) != self.CACHE_VERSION: # The cache must be updated self._log( "The cache for {} {} is outdated. 
Refreshing.".format(name, version), level="debug", ) cached = self._get_release_info(name, version) self._cache.forever("{}:{}".format(name, version), cached) return cached def _get_release_info(self, name, version): # type: (str, str) -> dict self._log("Getting info for {} ({}) from PyPI".format(name, version), "debug") json_data = self._get("pypi/{}/{}/json".format(name, version)) if json_data is None: raise PackageNotFound("Package [{}] not found.".format(name)) info = json_data["info"] data = { "name": info["name"], "version": info["version"], "summary": info["summary"], "platform": info["platform"], "requires_dist": info["requires_dist"], "requires_python": info["requires_python"], "files": [], "_cache_version": str(self.CACHE_VERSION), } try: version_info = json_data["releases"][version] except KeyError: version_info = [] for file_info in version_info: data["files"].append( { "file": file_info["filename"], "hash": "sha256:" + file_info["digests"]["sha256"], } ) if self._fallback and data["requires_dist"] is None: self._log("No dependencies found, downloading archives", level="debug") # No dependencies set (along with other information) # This might be due to actually no dependencies # or badly set metadata when uploading # So, we need to make sure there is actually no # dependencies by introspecting packages urls = defaultdict(list) for url in json_data["urls"]: # Only get sdist and wheels if they exist dist_type = url["packagetype"] if dist_type not in ["sdist", "bdist_wheel"]: continue urls[dist_type].append(url["url"]) if not urls: return data info = self._get_info_from_urls(urls) data["requires_dist"] = info["requires_dist"] if not data["requires_python"]: data["requires_python"] = info["requires_python"] return data def _get(self, endpoint): # type: (str) -> Union[dict, None] try: json_response = self._session.get(self._url + endpoint) except TooManyRedirects: # Cache control redirect loop. 
# We try to remove the cache and try again self._cache_control_cache.delete(self._url + endpoint) json_response = self._session.get(self._url + endpoint) if json_response.status_code == 404: return None json_data = json_response.json() return json_data def _get_info_from_urls( self, urls ): # type: (Dict[str, List[str]]) -> Dict[str, Union[str, List, None]] # Checking wheels first as they are more likely to hold # the necessary information if "bdist_wheel" in urls: # Check fo a universal wheel wheels = urls["bdist_wheel"] universal_wheel = None universal_python2_wheel = None universal_python3_wheel = None platform_specific_wheels = [] for wheel in wheels: link = Link(wheel) m = wheel_file_re.match(link.filename) if not m: continue pyver = m.group("pyver") abi = m.group("abi") plat = m.group("plat") if abi == "none" and plat == "any": # Universal wheel if pyver == "py2.py3": # Any Python universal_wheel = wheel elif pyver == "py2": universal_python2_wheel = wheel else: universal_python3_wheel = wheel else: platform_specific_wheels.append(wheel) if universal_wheel is not None: return self._get_info_from_wheel(universal_wheel) info = {} if universal_python2_wheel and universal_python3_wheel: info = self._get_info_from_wheel(universal_python2_wheel) py3_info = self._get_info_from_wheel(universal_python3_wheel) if py3_info["requires_dist"]: if not info["requires_dist"]: info["requires_dist"] = py3_info["requires_dist"] return info py2_requires_dist = set( dependency_from_pep_508(r).to_pep_508() for r in info["requires_dist"] ) py3_requires_dist = set( dependency_from_pep_508(r).to_pep_508() for r in py3_info["requires_dist"] ) base_requires_dist = py2_requires_dist & py3_requires_dist py2_only_requires_dist = py2_requires_dist - py3_requires_dist py3_only_requires_dist = py3_requires_dist - py2_requires_dist # Normalizing requires_dist requires_dist = list(base_requires_dist) for requirement in py2_only_requires_dist: dep = dependency_from_pep_508(requirement) dep.marker = dep.marker.intersect( parse_marker("python_version == '2.7'") ) requires_dist.append(dep.to_pep_508()) for requirement in py3_only_requires_dist: dep = dependency_from_pep_508(requirement) dep.marker = dep.marker.intersect( parse_marker("python_version >= '3'") ) requires_dist.append(dep.to_pep_508()) info["requires_dist"] = sorted(list(set(requires_dist))) if info: return info # Prefer non platform specific wheels if universal_python3_wheel: return self._get_info_from_wheel(universal_python3_wheel) if universal_python2_wheel: return self._get_info_from_wheel(universal_python2_wheel) if platform_specific_wheels and "sdist" not in urls: # Pick the first wheel available and hope for the best return self._get_info_from_wheel(platform_specific_wheels[0]) return self._get_info_from_sdist(urls["sdist"][0]) def _get_info_from_wheel( self, url ): # type: (str) -> Dict[str, Union[str, List, None]] self._log( "Downloading wheel: {}".format(urlparse.urlparse(url).path.rsplit("/")[-1]), level="debug", ) filename = os.path.basename(urlparse.urlparse(url).path.rsplit("/")[-1]) with temporary_directory() as temp_dir: filepath = Path(temp_dir) / filename self._download(url, str(filepath)) return self._inspector.inspect_wheel(filepath) def _get_info_from_sdist( self, url ): # type: (str) -> Dict[str, Union[str, List, None]] self._log( "Downloading sdist: {}".format(urlparse.urlparse(url).path.rsplit("/")[-1]), level="debug", ) filename = os.path.basename(urlparse.urlparse(url).path) with temporary_directory() as temp_dir: filepath = 
Path(temp_dir) / filename self._download(url, str(filepath)) return self._inspector.inspect_sdist(filepath) def _download(self, url, dest): # type: (str, str) -> None r = get(url, stream=True) r.raise_for_status() with open(dest, "wb") as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) def _log(self, msg, level="info"): getattr(logger, level)("<comment>{}:</comment> {}".format(self._name, msg))
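A compact sketch of the recovery pattern used in _get() above: keep a reference to the FileCache so a poisoned entry (for example one that triggers a redirect loop) can be deleted by its URL key and the request retried once against the network. The package URL and cache path are illustrative.

import requests
from requests.exceptions import TooManyRedirects
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

http_cache = FileCache('pypi_http_cache')
session = CacheControl(requests.Session(), cache=http_cache)

url = 'https://pypi.org/pypi/requests/json'
try:
    response = session.get(url)
except TooManyRedirects:
    # drop the cached entry (the cache key is the URL) and retry
    http_cache.delete(url)
    response = session.get(url)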
def main(): # create dirs root_dir = Path(__file__).resolve().parents[1] dump_dir = root_dir / 'dump' mkdirs(dump_dir) # determine search_urls (should be roughly 0.9B words in total) search_urls = [ f'https://www.smashwords.com/books/category/1/downloads/0/free/medium/{i}' for i in range(0, 30000 + 1, 20) ] # get headers (user-agents) headers = get_headers(root_dir / 'user-agents.txt') # initialize cache-controlled session session = CacheControl(Session()) with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # get/write book_page_urls book_page_urls = [] with open(dump_dir / 'book_page_urls.txt', 'w') as f: for nb_retry in count(1): # break if all search_urls successful if not search_urls: break # break if max number of retries exceeded if nb_retry > NB_RETRIES: print( f'Could not get {len(search_urls)} search pages after {NB_RETRIES} retries.' ) break # maintain a list of failed searches (for future retries) failed_search_urls = [] # get the search_responses search_responses = list( tqdm(executor.map(get, search_urls, repeat(session), cycle(headers)), total=len(search_urls), desc='Getting searches')) # dump the search_responses dump(search_responses, 'search_responses.pkl') for search_url, search_r in zip(search_urls, search_responses): if search_r is not None: if search_r.status_code == 200: search_r.encoding = 'utf-8' search_tree = html.fromstring(search_r.content) search_tree.make_links_absolute(search_r.url) try: for book_page_url in search_tree.xpath( '//a[@class="library-title"]/@href'): book_page_urls.append(book_page_url) f.write(book_page_url + '\n') except IndexError: failed_search_urls.append(search_url) print(f'Request failed for {search_url}') else: failed_search_urls.append(search_url) print( f'Request failed for {search_url}: status code [{search_r.status_code}]' ) search_urls = failed_search_urls # write book_download_urls.txt with open(root_dir / 'book_download_urls.txt', 'w') as f: for nb_retry in count(1): # break if all book_page_urls successful if not book_page_urls: break # break if max number of retries exceeded if nb_retry > NB_RETRIES: print( f'Could not get {len(book_page_urls)} book pages after {NB_RETRIES} retries.' ) break # maintain a list of failed book_pagees (for future retries) failed_book_page_urls = [] # get the book_page_responses book_page_responses = list( tqdm(executor.map(get, book_page_urls, repeat(session), cycle(headers)), total=len(book_page_urls), desc='Getting book pages')) # dump the book_page_responses dump(book_page_responses, 'book_page_responses.pkl') for book_page_url, book_page_r in zip(book_page_urls, book_page_responses): if book_page_r is not None: if book_page_r.status_code == 200: book_page_r.encoding = 'utf-8' book_page_tree = html.fromstring( book_page_r.content) try: # get relevant data script_text = book_page_tree.xpath( '//div[@id="contentArea"]/script/text()' )[0] _json = json.loads( script_text.split( 'window.angularData.book = ')[1].split( '};')[0] + '}') try: language = _json['language']['name'] if language == 'English': formats = _json['formats'] if 'TXT' in formats: f.write( book_page_tree.xpath( '//a[@title="Plain text; contains no formatting"]/@href' )[0] + '\n') else: continue except KeyError: continue except IndexError: failed_book_page_urls.append(book_page_url) print(f'Request failed for {book_page_url}') else: failed_book_page_urls.append(book_page_url) print( f'Request failed for {book_page_url}: status code [{book_page_r.status_code}]' ) book_page_urls = failed_book_page_urls
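The scraper above maps a get() helper over the URL lists together with the shared CacheControl session, but the helper itself is not shown. The following is a hypothetical sketch of what it might look like, on the assumptions that each headers element is a dict such as {'User-Agent': ...} and that connection errors should be swallowed so the retry loops can collect the failed URLs; the timeout value is also an assumption.

import requests


def get(url, session, headers, timeout=30):
    """Hypothetical fetch helper: return a Response, or None on a request error."""
    try:
        return session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None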