def all_sites(sitemap_url='http://library.link/harvest/sitemap.xml'):
    '''
    >>> from librarylink.util import all_sites
    >>> [ s.host for s in all_sites() if 'denverlibrary' in s.host ]
    ['link.denverlibrary.org']
    '''
    #FIXME: Avoid accumulating all the nodes, which will require improvements to xml.treesequence
    @coroutine
    def sink(accumulator):
        while True:
            e = yield
            loc = next(select_name(e, 'loc'))
            lastmod = next(select_name(e, 'lastmod'))
            s = liblink_site()
            s.sitemap = loc.xml_value
            s.url, _, tail = s.sitemap.partition('harvest/sitemap.xml')
            s.base_url = s.url  #Legacy property name
            #Early warning for funky URLs breaking stuff downstream
            assert not tail
            protocol, s.host, path, query, fragment = iri.split_uri_ref(s.sitemap)
            s.lastmod = lastmod.xml_value
            accumulator.append(s)

    nodes = []
    ts = xml.treesequence(('sitemapindex', 'sitemap'), sink(nodes))
    if hasattr(all_sites, 'cachedir'):
        sess = CacheControl(requests.Session(), cache=FileCache(all_sites.cachedir))
    else:
        sess = CacheControl(requests.Session())
    result = sess.get(sitemap_url)
    ts.parse(result.text)
    yield from nodes
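# Usage sketch (not from the original source): the snippet above only enables
# on-disk caching when an optional `cachedir` attribute has been set on the
# function, so a caller can opt in like this. The path is purely illustrative.
all_sites.cachedir = '/tmp/librarylink-http-cache'
hosts = [s.host for s in all_sites()]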
def sess(self, url, tmpdir):
    self.url = url
    self.cache = FileCache(str(tmpdir))
    sess = CacheControl(requests.Session(), cache=self.cache)
    yield sess

    # closing session object
    sess.close()
def get_reader(self):
    sess = CacheControl(requests.Session(), cache=FileCache(gettempdir()))
    req = sess.get(self.file)
    # if the response is not 200, an exception will be raised
    req.raise_for_status()
    return io.BufferedReader(io.BytesIO(req.content))
def sess(self, url):
    self.etag_url = urljoin(url, "/etag")
    self.update_etag_url = urljoin(url, "/update_etag")
    self.cache = DictCache()
    sess = CacheControl(
        requests.Session(), cache=self.cache, serializer=NullSerializer()
    )
    yield sess

    # closing session object
    sess.close()
def get_cached_session():
    sess = CacheControl(requests.Session(),
                        cache=FileCache(CACHE_DIR),
                        heuristic=LastModifiedNoDate(require_date=False))
    original_get = sess.get

    def wrapped_get(*args, **kwargs):
        try:
            return original_get(*args, **kwargs)
        except (OSError, IOError) as e:
            return requests.get(*args, **kwargs)

    sess.get = wrapped_get
    return sess
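# Usage sketch (not from the original source): repeated GETs go through the
# CacheControl session backed by FileCache(CACHE_DIR); an OSError/IOError from
# the cache layer falls back to a plain, uncached requests.get. URL illustrative.
sess = get_cached_session()
first = sess.get("https://example.org/feed.xml")   # populates the cache
second = sess.get("https://example.org/feed.xml")  # may be served from CACHE_DIR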
def fetch_file(url, encoding=None):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    try:
        response = s.get(url, headers={})
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection")

    _assert(response.ok, "%s\nFile request failed: %s (%d)" % (url, response.reason, response.status_code))
    if encoding:
        response.encoding = encoding
    return response.text
def send_answer(self, answer, key):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    url = "%s/%s/answers" % (self.url, key)
    data = data2json(answer).encode('utf-8')
    tokens = JsonFile(os.path.expanduser('~/.tst/tokens.json'))
    headers = {"Authorization": "Bearer %s" % tokens.get(self.name)}
    try:
        response = s.post(url, headers=headers, data=data, allow_redirects=True)
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection (1)")

    return response
def test_not_modified_releases_connection(self, server):
    sess = CacheControl(requests.Session())
    etag_url = urljoin(server.application_url, "/etag")
    sess.get(etag_url)

    resp = Mock(status=304, headers={})

    # This is how the urllib3 response is created in
    # requests.adapters
    response_mod = "requests.adapters.HTTPResponse.from_httplib"

    with patch(response_mod, Mock(return_value=resp)):
        sess.get(etag_url)
        assert resp.read.called
        assert resp.release_conn.called
class TestStream(object):

    def setup(self):
        self.sess = CacheControl(requests.Session())

    def test_stream_is_cached(self, url):
        resp_1 = self.sess.get(url + 'stream')
        content_1 = resp_1.content

        resp_2 = self.sess.get(url + 'stream')
        content_2 = resp_2.content

        assert not resp_1.from_cache
        assert resp_2.from_cache
        assert content_1 == content_2
class JSONLocator(Locator):
    def __init__(self, url=PYPI_JSON_URL):
        self.url = url
        self.session = CacheControl(requests.session())

    def versions(self, distribution):
        url = "{}/{}/json".format(self.url, distribution)
        response = self.session.get(url)
        j = response.json()['releases']
        return [v for v, d in j.items() if len(d) > 0]

    def get(self, distribution, version):
        url = "{}/{}/json".format(self.url, distribution)
        response = self.session.get(url)
        # Reformat the data...
        return response.json()['releases'][version]
def getURL(url, post_data=None, params=None, headers=None, timeout=30, session=None, json=False):
    """
    Returns a byte-string retrieved from the url provider.
    """

    # request session
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(cache_dir, "sessions")))

    # request session headers
    req_headers = {"User-Agent": USER_AGENT, "Accept-Encoding": "gzip,deflate"}
    if headers:
        req_headers.update(headers)
    session.headers.update(req_headers)

    # request session ssl verify
    session.verify = False

    # request session parameters
    session.params = params

    try:
        # Remove double-slashes from url
        parsed = list(urlparse.urlparse(url))
        parsed[2] = re.sub("/{2,}", "/", parsed[2])  # replace two or more / with one
        url = urlparse.urlunparse(parsed)

        # request session proxies
        if sickbeard.PROXY_SETTING:
            logger.log("Using proxy for url: " + url, logger.DEBUG)
            session.proxies = {"http": sickbeard.PROXY_SETTING, "https": sickbeard.PROXY_SETTING}

        resp = session.get(url, data=post_data, timeout=timeout)
        if not resp.ok:
            logger.log(
                u"Requested url " + url + " returned status code is " + str(resp.status_code)
                + ": " + clients.http_error_code[resp.status_code],
                logger.DEBUG,
            )
            return
    except requests.exceptions.HTTPError as e:
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return
class LDClient(object):

    def __init__(self, apiKey, config=Config.default()):
        self._apiKey = apiKey
        self._config = config
        self._session = CacheControl(requests.Session())

    def get_flag(self, key, user, default=False):
        try:
            return self._get_flag(key, user, default)
        except:
            logging.exception('Unhandled exception in get_flag. Returning default value for flag.')
            return default

    def _get_flag(self, key, user, default):
        hdrs = {'Authorization': 'api_key ' + self._apiKey,
                'User-Agent': 'PythonClient/' + __version__}
        uri = self._config._base_uri + '/api/eval/features/' + key
        r = self._session.get(uri, headers=hdrs, timeout=(self._config._connect, self._config._read))
        dict = r.json()
        val = _evaluate(dict, user)
        if val is None:
            return default
        else:
            return val
def __init__(self, name, url, disable_cache=False):
    if name == "pypi":
        raise ValueError("The name [pypi] is reserved for repositories")

    self._packages = []
    self._name = name
    self._url = url.rstrip("/")
    self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name
    self._cache = CacheManager(
        {
            "default": "releases",
            "serializer": "json",
            "stores": {
                "releases": {"driver": "file", "path": str(self._cache_dir)},
                "packages": {"driver": "dict"},
                "matches": {"driver": "dict"},
            },
        }
    )

    self._session = CacheControl(
        requests.session(), cache=FileCache(str(self._cache_dir / "_http"))
    )

    url_parts = urlparse.urlparse(self._url)
    if not url_parts.username:
        self._session.auth = get_http_basic_auth(self.name)

    self._disable_cache = disable_cache
def __init__(self):
    self.s = requests.Session()
    # We cache ALL responses for 60 min. so eg. inline lyrics request don't make two calls right after each other.
    # This MAY have unforeseen consequences, but hopefully we can deal with those.
    self.s = CacheControl(self.s, cache_etags=False, heuristic=ExpiresAfter(minutes=60))
    self.s.headers.update({'Accept': 'application/json', 'User-Agent': VOCADB_USER_AGENT})
    self.opts = {'nameMatchMode': 'Auto', 'getTotalCount': 'true'}
    self._resources = {}
def setup(self):
    class DummyHeuristic(BaseHeuristic):
        def update_headers(self, resp):
            return {"x-dummy-header": "foobar"}

    self.sess = CacheControl(Session(), heuristic=DummyHeuristic())
def download_file(url, filename, session=None):
    # create session
    cache_dir = sickbeard.CACHE_DIR or _getTempDir()
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(cache_dir, "sessions")))

    # request session headers
    session.headers.update({"User-Agent": USER_AGENT, "Accept-Encoding": "gzip,deflate"})

    # request session ssl verify
    session.verify = False

    # request session streaming
    session.stream = True

    # request session proxies
    if sickbeard.PROXY_SETTING:
        logger.log("Using proxy for url: " + url, logger.DEBUG)
        session.proxies = {"http": sickbeard.PROXY_SETTING, "https": sickbeard.PROXY_SETTING}

    try:
        resp = session.get(url)
        if not resp.ok:
            logger.log(
                u"Requested url " + url + " returned status code is " + str(resp.status_code)
                + ": " + clients.http_error_code[resp.status_code],
                logger.DEBUG,
            )
            return False

        with open(filename, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    fp.write(chunk)
                    fp.flush()

        chmodAsParent(filename)
    except requests.exceptions.HTTPError as e:
        _remove_file_failed(filename)
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return False
def _get_filehandle(filepath_or, *args, **kwargs):
    """Open file if `filepath_or` looks like a string/unicode/bytes, else
    pass through.
    """
    if _is_string_or_bytes(filepath_or):
        if requests.compat.urlparse(filepath_or).scheme in {'http', 'https'}:
            sess = CacheControl(requests.Session(), cache=FileCache(gettempdir()))
            req = sess.get(filepath_or, **kwargs)
            # if the response is not 200, an exception will be raised
            req.raise_for_status()
            fh, own_fh = BytesIO(req.content), True
        else:
            fh, own_fh = open(filepath_or, *args, **kwargs), True
    else:
        fh, own_fh = filepath_or, False
    return fh, own_fh
def setup(self):
    class NoopHeuristic(BaseHeuristic):
        warning = Mock()

        def update_headers(self, resp):
            return {}

    self.heuristic = NoopHeuristic()
    self.sess = CacheControl(Session(), heuristic=self.heuristic)
def __init__(self):
    try:
        from cachecontrol import CacheControl
        from cachecontrol.caches import FileCache
        import tempfile
        self._requests = CacheControl(self._requests,
                                      cache=FileCache(tempfile.gettempdir() + '/cagematch-cache', forever=True))
    except:
        logging.warning('CacheControl not available')
    self._requests.headers.update({'User-Agent': 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'})
def Request( url, method="GET", headers=DEFAULT_HEADERS, additional_headers=None, data=None, session=None, allow_redirects=True, timeout=10, load_cookies=True, mobile=False ): if additional_headers: headers.update(additional_headers) try: session = CacheControl(session) except Exception as e: pass # Error("Init web cache failed!!!", e) if mobile: headers["User-Agents"] = MOBILE_IOS_AGENTS xbmc.log("Requests headers: {0}".format(json.dumps(headers)), 1) if session: session.headers.update(headers) domain = re.search("https*\://(.+?)($|/)", url).group(1) if load_cookies: LoadCookies(session, cookies_name=domain) if data: response = session.post(url, data=data, allow_redirects=allow_redirects, timeout=timeout, verify=False) else: if method == "HEAD": response = session.head(url, allow_redirects=allow_redirects, timeout=timeout, verify=False) else: response = session.get(url, allow_redirects=allow_redirects, timeout=timeout, verify=False) response.encoding = "utf8" SaveCookies(session, cookies_name=domain) return response else: if method == "HEAD": return requests.head(url, headers=headers, allow_redirects=allow_redirects, timeout=timeout, verify=False) else: return requests.get(url, headers=headers, allow_redirects=allow_redirects, timeout=timeout, verify=False)
class reQuiver(object):
    def __init__(self):
        self._raw_endpoint = "http://quiver.archerdx.com/results?query="
        self._sesh = CacheControl(requests.Session())

    def query(self, query):
        if len(query) == 0:
            raise EmptyQueryStringException()

        q_string = self._raw_endpoint + str(query)
        response = self._sesh.get(q_string)
        if response.status_code != 200:
            raise NetworkErrorException(response.status_code)

        soup = BeautifulSoup(response.content, "html.parser")

        # parse the panels
        panels = soup.find(panel_table_filter)
        panels_list = []
        if panels is not None:
            for row in panels.find_all("tr"):
                cells = row.find_all("td")
                if len(cells) == 2:
                    link = cells[0].a['href']
                    genes = [clean_string(gene) for gene in cells[1].string.split()]
                    panels_list.append(QuiverFushionPlexPanel(link, genes))

        # parse the fusions
        fusions = soup.find_all(fusion_table_filter)
        fusions_list = []
        if fusions is not None:
            for fusion in fusions:
                table = fusion.find('table')
                for row in table.find_all('tr'):
                    cells = row.find_all('td')
                    if len(cells) != 2:
                        # get the link
                        link = cells[0].a['href']
                        original_annotation = clean_string(cells[1].string)
                        disease = cells[2].string.strip()
                        pubmed_link = cells[3].a['href']
                        evidence_count = int(cells[4].string)
                        fusions_list.append(QuiverGeneFushion(link, original_annotation, disease,
                                                              pubmed_link, evidence_count))

        return QuiverResultSet(panels_list, fusions_list, query)
def main():
    current = pkg_resources.get_distribution('tst').version
    if not sys.stdout.isatty():
        print(current)
        return

    cprint(WHITE, current, file=sys.stdout)
    try:
        s = requests.session()
        s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
        response = s.get('https://pypi.org/pypi/tst/json')
        data = response.json()
    except requests.ConnectionError:
        return

    latest_version = data['info']['version']
    if current != latest_version:
        cprint(YELLOW, 'Latest version available: %s' % latest_version, file=sys.stdout)
        cprint(RESET, '---\nUse `pip install --upgrade tst`')
        cprint(RESET, ' or `pip install --upgrade --user tst`')
class TestHeuristicWith3xxResponse(object):

    def setup(self):
        class DummyHeuristic(BaseHeuristic):
            def update_headers(self, resp):
                return {"x-dummy-header": "foobar"}

        self.sess = CacheControl(Session(), heuristic=DummyHeuristic())

    def test_heuristic_applies_to_301(self, url):
        the_url = url + "permanent_redirect"
        resp = self.sess.get(the_url)
        assert "x-dummy-header" in resp.headers

    def test_heuristic_applies_to_304(self, url):
        the_url = url + "conditional_get"
        resp = self.sess.get(the_url)
        assert "x-dummy-header" in resp.headers
def __init__(self, api_key, config=None):
    check_uwsgi()
    self._api_key = api_key
    self._config = config or Config.default()
    self._session = CacheControl(requests.Session())
    self._queue = queue.Queue(self._config._capacity)
    self._consumer = None
    self._offline = False
    self._lock = Lock()

    self._stream_processor = None
    if self._config._stream:
        self._stream_processor = config._stream_processor_class(api_key, config)
        self._stream_processor.start()
def __init__(self, queue, api_key, config):
    self._queue = queue
    """ @type: queue.Queue """
    self._session = CacheControl(txrequests.Session())
    """ :type: txrequests.Session """
    self._api_key = api_key
    self._config = config
    """ :type: ldclient.twisted.TwistedConfig """
    self._looping_call = None
    """ :type: LoopingCall """
def downloadHttpFile(httpurl):
    # type: (Text) -> Text
    cache_session = None
    if "XDG_CACHE_HOME" in os.environ:
        directory = os.environ["XDG_CACHE_HOME"]
    elif "HOME" in os.environ:
        directory = os.environ["HOME"]
    else:
        directory = os.path.expanduser('~')
    cache_session = CacheControl(
        requests.Session(),
        cache=FileCache(
            os.path.join(directory, ".cache", "cwltool")))

    r = cache_session.get(httpurl, stream=True)
    with NamedTemporaryFile(mode='wb', delete=False) as f:
        for chunk in r.iter_content(chunk_size=16384):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    r.close()
    return f.name
def getURL(url, post_data=None, params=None, headers=None, timeout=30, session=None, json=False):
    """
    Returns a byte-string retrieved from the url provider.
    """

    # request session
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(sickbeard.CACHE_DIR, 'sessions')))

    # request session headers
    req_headers = {'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'}
    if headers:
        req_headers.update(headers)
    session.headers.update(req_headers)

    # request session ssl verify
    session.verify = False

    # request session parameters
    session.params = params

    try:
        # Remove double-slashes from url
        parsed = list(urlparse.urlparse(url))
        parsed[2] = re.sub("/{2,}", "/", parsed[2])  # replace two or more / with one
        url = urlparse.urlunparse(parsed)

        # request session proxies
        if sickbeard.PROXY_SETTING:
            logger.log("Using proxy for url: " + url, logger.DEBUG)
            session.proxies = {
                "http": sickbeard.PROXY_SETTING,
                "https": sickbeard.PROXY_SETTING,
            }

        resp = session.get(url, data=post_data, timeout=timeout)
    except requests.exceptions.HTTPError as e:
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return
def download_file(url, filename, session=None):
    # create session
    session = CacheControl(sess=session, cache=caches.FileCache(os.path.join(sickbeard.CACHE_DIR, 'sessions')))

    # request session headers
    session.headers.update({'User-Agent': USER_AGENT, 'Accept-Encoding': 'gzip,deflate'})

    # request session ssl verify
    session.verify = False

    # request session streaming
    session.stream = True

    # request session proxies
    if sickbeard.PROXY_SETTING:
        logger.log("Using proxy for url: " + url, logger.DEBUG)
        session.proxies = {
            "http": sickbeard.PROXY_SETTING,
            "https": sickbeard.PROXY_SETTING,
        }

    try:
        resp = session.get(url)
        if not resp.ok:
            return False

        with open(filename, 'wb') as fp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    fp.write(chunk)
                    fp.flush()

        chmodAsParent(filename)
    except requests.exceptions.HTTPError as e:
        _remove_file_failed(filename)
        logger.log(u"HTTP error " + str(e.errno) + " while loading URL " + url, logger.WARNING)
        return False
def urls(self):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    headers = {}
    tokens = JsonFile(os.path.expanduser('~/.tst/tokens.json'))
    token = tokens.get(self.name)
    try:
        response = s.get(self.url, allow_redirects=True)
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection")

    if not response.ok:
        return None

    response.encoding = 'utf-8'
    try:
        resource = response.json()
        resource['_response'] = response
    except ValueError:
        return None

    return resource
def get(self, key):
    s = requests.session()
    s = CacheControl(s, cache=FileCache(os.path.expanduser('~/.tst/cache')))
    url = "%s/%s" % (self.url, key)
    headers = {}
    tokens = JsonFile(os.path.expanduser('~/.tst/tokens.json'))
    token = tokens.get(self.name)
    if token:
        headers['Authorization'] = 'Bearer %s' % token

    try:
        response = s.get(url, headers=headers, allow_redirects=True)
    except requests.ConnectionError:
        _assert(False, "Connection failed... check your internet connection")

    if not response.ok:
        self.last_error = response.status_code
        self.last_response = response
        return None

    response.encoding = 'utf-8'
    try:
        resource = response.json()
        resource['_response'] = response
        validate_tst_object(resource)
    except ValueError:
        #_assert(False, "Resource is not valid json")
        return None
    except AssertionError as e:
        print(resource)
        _assert(False, "Not a TST Object: %s" % e.message)

    return resource
def setup(self):
    self.sess = Session()
    self.cached_sess = CacheControl(self.sess, heuristic=LastModified())
def main(group_id, location, time_boundary, event_status, pandoc, force): key_path = os.path.normpath(os.path.expanduser('~/.meetup.com-key')) if os.path.exists(key_path): with io.open(key_path, encoding='utf8') as fh: key = fh.read().strip() else: key = None cache = FileCache('.web_cache', forever=True) requests = CacheControl(Session(), cache, cache_etags=False, heuristic=ExpiresAfter(days=1)) while True: resp = requests.get('https://api.meetup.com/status', params=dict(key=key)) if resp.status_code == 200 and resp.json().get('status') == 'ok': break elif resp.status_code == 200 and any( 'auth_fail' == e.code for e in resp.json().get('errors', [])): click.echo( 'Your meetup.com key is required. You can get it from https://secure.meetup.com/meetup_api/key/\n' ) if click.confirm( 'Open https://secure.meetup.com/meetup_api/key/ in your web browser?' ): click.launch('https://secure.meetup.com/meetup_api/key/') click.echo('') key = click.prompt('Key', hide_input=True) else: raise click.ClickException( 'Failed to get meetup.com status. Response was {!r} {!r}'. format(resp.status_code, resp.text)) click.secho( 'For convenience your key is saved in `{}`.\n'.format(key_path), fg='magenta') with open(key_path, 'w') as fh: fh.write(key) while not location: location = location or get_input( u'Location: ', completer=WordCompleter( [u'cluj', u'iasi', u'timisoara', u'bucuresti'], ignore_case=True)) while True: group_id = group_id or get_input( u'Group ID: ', completer=WordCompleter([ u'RoPython-Bucuresti', u'RoPython-Cluj', u'RoPython_Iasi', u'RoPython-Timisoara' ], ignore_case=True)) resp = requests.get('https://api.meetup.com/2/events', params=dict( key=key, group_urlname=group_id, time=time_boundary, status=event_status, )) if resp.status_code == 200: json = resp.json() if json['results']: break else: click.secho( 'Invalid group `{}`. It has no events!'.format(group_id), fg='red') group_id = None if resp.status_code == '400': click.fail( 'Failed to get make correct request. Response was {!r}'.format( resp.text)) else: click.secho('Invalid group `{}`. Response was [{}] {!r}'.format( group_id, resp.status_code, resp.text), fg='red') # click.echo(pformat(dict(resp.headers))) for event in json['results']: dt = datetime.fromtimestamp(event['time'] / 1000) event['duration'] = format_duration( event.get('duration', 3600000) / 1000) event['time'] = dt.strftime('%Y-%m-%d %H:%M') if 'how_to_find_us' in event: address = event['how_to_find_us'], else: address = () if 'venue' in event: address_1 = event['venue'].get('address_1') if address_1: address += address_1, event['venue']['address_1'] = ', '.join(address) else: event['venue'] = {'address_1': address} click.echo("{time}: {name}".format(**event)) click.echo("\t{}".format(pformat(event))) existing_path = glob( os.path.join('content', '*', dt.strftime('%Y-%m-%d*'), 'index.rst')) if existing_path and not force: if len(existing_path) > 1: click.secho('\tERROR: multiple paths matched: {}'.format( existing_path)) else: click.secho('\t`{}` already exists. 
Not importing.'.format( *existing_path), fg='yellow') else: target_dir = os.path.join( 'content', location, '{}-{}'.format(dt.strftime('%Y-%m-%d'), slugify(event['name']))) target_path = os.path.join(target_dir, 'index.rst') if not os.path.exists(target_dir): os.makedirs(target_dir) if pandoc: with tempfile.NamedTemporaryFile(delete=False) as fh: fh.write(event['description'].encode('utf-8')) rst = subprocess.check_output( ['pandoc', '--from=html', '--to=rst', fh.name]).decode('utf-8') os.unlink(fh.name) else: rst = html2rest(event['description']) doc = u'''{name} ############################################################### :tags: prezentari :registration: meetup.com: {event_url} :start: {time} :duration: {duration} :location: {venue[address_1]}, {venue[city]}, {venue[localized_country_name]} {rst}'''.format(rst=rst, **event) with io.open(target_path, 'w', encoding='utf-8') as fh: fh.write(doc) click.secho('\tWrote `{}`.'.format(target_path), fg='green')
def __init__(self, api_key, config):
    self._api_key = api_key
    self._session = CacheControl(requests.Session())
    self._config = config
import itertools
import os
import shutil
import subprocess
import sys

import click
import requests
from cachecontrol import CacheControl
from pipdownload import logger
from pipdownload.utils import (TempDirectory, download, get_file_links,
                               mkurl_pypi_url, resolve_package_files)

sess = requests.Session()
session = CacheControl(sess)


@click.command()
@click.argument('packages', nargs=-1)
@click.option('-i', '--index-url', 'index_url',
              default='https://pypi.tuna.tsinghua.edu.cn/simple',
              type=click.STRING,
              help='Pypi index.')
@click.option('-r', '--requirement', 'requirement_file',
              type=click.File(encoding='utf-8'),
              help='Requirements File.')
import os
import sys
import logging
import requests
import json
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache

logging.basicConfig(stream=sys.stdout,
                    format="%(asctime)s: " + logging.BASIC_FORMAT,
                    datefmt="%Y-%m-%dT%H:%M:%S%z")
logger = logging.getLogger(__name__)

req = CacheControl(
    requests.Session(),
    cache=FileCache(os.path.join('/tmp', 'pyutu.cache'))
)

# regions = {
#     'ap-northeast-1': "Asia Pacific (Tokyo)",
#     'ap-northeast-2': "Asia Pacific (Seoul)",
#     'ap-southeast-1': "Asia Pacific (Singapore)",
#     'ap-southeast-2': "Asia Pacific (Sydney)",
#     'ap-south-1': "Asia Pacific (Mumbai)",
#     'ca-central-1': "Canada (Central)",
#     'eu-central-1': "EU (Frankfurt)",
#     'eu-west-1': "EU (Ireland)",
#     'eu-west-2': "EU (London)",
#     'sa-east-1': "South America (Sao Paulo)",
#     'us-east-1': "US East (N. Virginia)",
#     'us-east-2': "US East (Ohio)",
def requests_session(nocache=False):
    if nocache:
        return requests.Session()
    return CacheControl(requests.Session(), cache=FileCache(CACHE_FILENAME))
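# Usage sketch (not from the original source): CACHE_FILENAME is defined in the
# surrounding module; the URL is illustrative. CacheControl tags responses it
# builds with a `from_cache` attribute, which is handy for checking cache hits.
sess = requests_session()               # cached session backed by CACHE_FILENAME
resp = sess.get("https://example.org/api/data")
print(getattr(resp, "from_cache", False))
live = requests_session(nocache=True)   # plain requests.Session, no caching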
def __init__(self, settings):
    self.settings = settings
    self.handler_config = settings.config[settings.env]['handler']
    self.cached_session = CacheControl(requests.session())
    self._init_db()
from flask import Flask, render_template, request
from bs4 import BeautifulSoup
import requests
from cachecontrol import CacheControl

sessionCached = CacheControl(requests.session())
application = Flask(__name__)


@application.route('/', methods=['GET', 'POST'])
def index():
    r = sessionCached.get('https://www.etax.nat.gov.tw/etw-main/web/ETW183W1/')
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")
    link = soup.find_all("a", {"href": lambda s: s and s.startswith("/etw-main/web/ETW183W2_")})[:2]
    link = [x["href"][-5:] for x in link]
    month = request.values.get("month")
    if month is None:
        month = link[0]
    chkRadio = [" active" if month == x else "" for x in link]
    r = sessionCached.get('https://www.etax.nat.gov.tw/etw-main/web/ETW183W2_' + month)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, "html.parser")
    prize = {}
    prize[soup.find(id="specialPrize").parent.td.text.strip()] = "特別獎"
    prize[soup.find(id="grandPrize").parent.td.text.strip()] = "特獎"
    for e in soup.find(id="firstPrize").parent.td.text.split():
        prize[e] = "頭獎"
    for e in soup.find(id="addSixPrize").parent.td.text.strip().split("、"):
# # WG Notifications of deaths of residents related to COVID-19 in adult care homes

from gssutils import *
import json
import numpy as np

if is_interactive():
    from requests import Session
    from cachecontrol import CacheControl
    from cachecontrol.caches.file_cache import FileCache
    from cachecontrol.heuristics import ExpiresAfter
    scrape = Scraper(seed="info.json",
                     session=CacheControl(Session(),
                                          cache=FileCache('.cache'),
                                          heuristic=ExpiresAfter(days=1)))

dist = scrape.distribution(
    latest=True,
    title=lambda x: x.startswith(
        'Notifications of deaths of residents related to COVID-19'))
tabs = {tab.name: tab for tab in dist.as_databaker()}
list(tabs)

# +
def left(s, amount):
    return s[:amount]

def right(s, amount):
    return s[-amount:]
class LegacyRepository(PyPiRepository): def __init__( self, name, url, auth=None, disable_cache=False ): # type: (str, str, Optional[Auth], bool) -> None if name == "pypi": raise ValueError("The name [pypi] is reserved for repositories") self._packages = [] self._name = name self._url = url.rstrip("/") self._cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / name self._cache = CacheManager( { "default": "releases", "serializer": "json", "stores": { "releases": {"driver": "file", "path": str(self._cache_dir)}, "packages": {"driver": "dict"}, "matches": {"driver": "dict"}, }, } ) self._session = CacheControl( requests.session(), cache=FileCache(str(self._cache_dir / "_http")) ) url_parts = urlparse.urlparse(self._url) if not url_parts.username and auth: self._session.auth = auth self._disable_cache = disable_cache @property def name(self): return self._name def find_packages( self, name, constraint=None, extras=None, allow_prereleases=False ): packages = [] if constraint is None: constraint = "*" if not isinstance(constraint, VersionConstraint): constraint = parse_constraint(constraint) if isinstance(constraint, VersionRange): if ( constraint.max is not None and constraint.max.is_prerelease() or constraint.min is not None and constraint.min.is_prerelease() ): allow_prereleases = True key = name if not constraint.is_any(): key = "{}:{}".format(key, str(constraint)) if self._cache.store("matches").has(key): versions = self._cache.store("matches").get(key) else: page = self._get("/{}/".format(canonicalize_name(name).replace(".", "-"))) if page is None: return [] versions = [] for version in page.versions: if version.is_prerelease() and not allow_prereleases: continue if constraint.allows(version): versions.append(version) self._cache.store("matches").put(key, versions, 5) for version in versions: package = Package(name, version) package.source_type = "legacy" package.source_url = self._url if extras is not None: package.requires_extras = extras packages.append(package) self._log( "{} packages found for {} {}".format(len(packages), name, str(constraint)), level="debug", ) return packages def package( self, name, version, extras=None ): # type: (...) -> poetry.packages.Package """ Retrieve the release information. This is a heavy task which takes time. We have to download a package to get the dependencies. We also need to download every file matching this release to get the various hashes. Note that, this will be cached so the subsequent operations should be much faster. 
""" try: index = self._packages.index( poetry.packages.Package(name, version, version) ) return self._packages[index] except ValueError: if extras is None: extras = [] release_info = self.get_release_info(name, version) package = poetry.packages.Package(name, version, version) if release_info["requires_python"]: package.python_versions = release_info["requires_python"] package.source_type = "legacy" package.source_url = self._url package.source_reference = self.name requires_dist = release_info["requires_dist"] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(";")[0] dependency = dependency_from_pep_508(req) except ValueError: # Likely unable to parse constraint so we skip it self._log( "Invalid constraint ({}) found in {}-{} dependencies, " "skipping".format(req, package.name, package.version), level="debug", ) continue if dependency.in_extras: for extra in dependency.in_extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get("summary", "") # Adding hashes information package.hashes = release_info["digests"] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += package.extras[extra] self._packages.append(package) return package def _get_release_info(self, name, version): # type: (str, str) -> dict page = self._get("/{}/".format(canonicalize_name(name).replace(".", "-"))) if page is None: raise PackageNotFound('No package named "{}"'.format(name)) data = { "name": name, "version": version, "summary": "", "requires_dist": [], "requires_python": None, "digests": [], "_cache_version": str(self.CACHE_VERSION), } links = list(page.links_for_version(Version.parse(version))) if not links: raise PackageNotFound( 'No valid distribution links found for package: "{}" version: "{}"'.format( name, version ) ) urls = defaultdict(list) hashes = [] for link in links: if link.is_wheel: urls["bdist_wheel"].append(link.url) elif link.filename.endswith( (".tar.gz", ".zip", ".bz2", ".xz", ".Z", ".tar") ): urls["sdist"].append(link.url) hash = link.hash if link.hash_name == "sha256": hashes.append(hash) else: hashes.append(link.hash_name + ":" + hash) data["digests"] = hashes info = self._get_info_from_urls(urls) data["summary"] = info["summary"] data["requires_dist"] = info["requires_dist"] data["requires_python"] = info["requires_python"] return data def _download(self, url, dest): # type: (str, str) -> None r = self._session.get(url, stream=True) with open(dest, "wb") as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) def _get(self, endpoint): # type: (str) -> Union[Page, None] url = self._url + endpoint response = self._session.get(url) if response.status_code == 404: return return Page(url, response.content, response.headers)
class PyPiRepository(Repository): def __init__(self, url='https://pypi.org/', disable_cache=False, fallback=True): self._name = 'PyPI' self._url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / 'pypi' self._cache = CacheManager({ 'default': 'releases', 'serializer': 'json', 'stores': { 'releases': { 'driver': 'file', 'path': str(release_cache_dir) }, 'packages': { 'driver': 'dict' } } }) self._session = CacheControl( session(), cache=FileCache(str(release_cache_dir / '_http')) ) super(PyPiRepository, self).__init__() def find_packages(self, name, # type: str constraint=None, # type: Union[Constraint, str, None] extras=None, # type: Union[list, None] allow_prereleases=False # type: bool ): # type: (...) -> List[Package] """ Find packages on the remote server. """ if constraint is not None and not isinstance(constraint, BaseConstraint): version_parser = VersionParser() constraint = version_parser.parse_constraints(constraint) info = self.get_package_info(name) packages = [] for version, release in info['releases'].items(): if not release: # Bad release self._log( 'No release information found for {}-{}, skipping'.format( name, version ), level='debug' ) continue package = Package(name, version) if package.is_prerelease() and not allow_prereleases: continue if ( not constraint or (constraint and constraint.matches(Constraint('=', version))) ): if extras is not None: package.requires_extras = extras packages.append(package) self._log( '{} packages found for {} {}'.format( len(packages), name, str(constraint) ), level='debug' ) return packages def package(self, name, # type: str version, # type: str extras=None # type: (Union[list, None]) ): # type: (...) -> Union[Package, None] try: index = self._packages.index(Package(name, version, version)) return self._packages[index] except ValueError: if extras is None: extras = [] release_info = self.get_release_info(name, version) if ( self._fallback and release_info['requires_dist'] is None and not release_info['requires_python'] and '_fallback' not in release_info ): # Force cache update self._log( 'No dependencies found, downloading archives', level='debug' ) self._cache.forget('{}:{}'.format(name, version)) release_info = self.get_release_info(name, version) package = Package(name, version, version) requires_dist = release_info['requires_dist'] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(';')[0] dependency = dependency_from_pep_508(req) except ValueError: # Likely unable to parse constraint so we skip it self._log( 'Invalid constraint ({}) found in {}-{} dependencies, ' 'skipping'.format( req, package.name, package.version ), level='debug' ) continue if dependency.extras: for extra in dependency.extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get('summary', '') if release_info['requires_python']: package.python_versions = release_info['requires_python'] if release_info['platform']: package.platform = release_info['platform'] # Adding hashes information package.hashes = release_info['digests'] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += 
package.extras[extra] self._packages.append(package) return package def search(self, query, mode=0): results = [] search = { 'name': query } if mode == self.SEARCH_FULLTEXT: search['summary'] = query client = ServerProxy('https://pypi.python.org/pypi') hits = client.search(search, 'or') for hit in hits: result = Package(hit['name'], hit['version'], hit['version']) result.description = to_str(hit['summary']) results.append(result) return results def get_package_info(self, name): # type: (str) -> dict """ Return the package information given its name. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_package_info(name) return self._cache.store('packages').remember_forever( name, lambda: self._get_package_info(name) ) def _get_package_info(self, name): # type: (str) -> dict data = self._get('pypi/{}/json'.format(name)) if data is None: raise ValueError('Package [{}] not found.'.format(name)) return data def get_release_info(self, name, version): # type: (str, str) -> dict """ Return the release information given a package name and a version. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_release_info(name, version) return self._cache.remember_forever( '{}:{}'.format(name, version), lambda: self._get_release_info(name, version) ) def _get_release_info(self, name, version): # type: (str, str) -> dict json_data = self._get('pypi/{}/{}/json'.format(name, version)) if json_data is None: raise ValueError('Package [{}] not found.'.format(name)) info = json_data['info'] data = { 'name': info['name'], 'version': info['version'], 'summary': info['summary'], 'platform': info['platform'], 'requires_dist': info['requires_dist'], 'requires_python': info['requires_python'], 'digests': [], '_fallback': False } try: version_info = json_data['releases'][version] except KeyError: version_info = [] for file_info in version_info: data['digests'].append(file_info['digests']['sha256']) if ( self._fallback and data['requires_dist'] is None and not data['requires_python'] ): # No dependencies set (along with other information) # This might be due to actually no dependencies # or badly set metadata when uploading # So, we need to make sure there is actually no # dependencies by introspecting packages data['_fallback'] = True urls = {} for url in json_data['urls']: # Only get sdist and universal wheels dist_type = url['packagetype'] if dist_type not in ['sdist', 'bdist_wheel']: continue if dist_type == 'sdist' and 'dist' not in urls: urls[url['packagetype']] = url['url'] continue if 'bdist_wheel' in urls: continue # If bdist_wheel, check if it's universal python_version = url['python_version'] if python_version not in ['py2.py3', 'py3', 'py2']: continue parts = urlparse.urlparse(url['url']) filename = os.path.basename(parts.path) if '-none-any' not in filename: continue if not urls: return data requires_dist = self._get_requires_dist_from_urls(urls) data['requires_dist'] = requires_dist return data def _get(self, endpoint): # type: (str) -> Union[dict, None] json_response = self._session.get(self._url + endpoint) if json_response.status_code == 404: return None json_data = json_response.json() return json_data def _get_requires_dist_from_urls(self, urls ): # type: (dict) -> Union[list, None] if 'bdist_wheel' in urls: return self._get_requires_dist_from_wheel(urls['bdist_wheek']) return self._get_requires_dist_from_sdist(urls['sdist']) def 
_get_requires_dist_from_wheel(self, url ): # type: (str) -> Union[list, None] filename = os.path.basename(urlparse.urlparse(url).path) with temporary_directory() as temp_dir: filepath = os.path.join(temp_dir, filename) self._download(url, filepath) try: meta = pkginfo.Wheel(filepath) except ValueError: # Unable to determine dependencies # Assume none return if meta.requires_dist: return meta.requires_dist def _get_requires_dist_from_sdist(self, url ): # type: (str) -> Union[list, None] filename = os.path.basename(urlparse.urlparse(url).path) with temporary_directory() as temp_dir: filepath = Path(temp_dir) / filename self._download(url, str(filepath)) try: meta = pkginfo.SDist(str(filepath)) if meta.requires_dist: return meta.requires_dist except ValueError: # Unable to determine dependencies # We pass and go deeper pass # Still not dependencies found # So, we unpack and introspect suffix = filepath.suffix gz = None if suffix == '.zip': tar = zipfile.ZipFile(str(filepath)) else: if suffix == '.bz2': gz = BZ2File(str(filepath)) else: gz = GzipFile(str(filepath)) tar = tarfile.TarFile(str(filepath), fileobj=gz) try: tar.extractall(os.path.join(temp_dir, 'unpacked')) finally: if gz: gz.close() tar.close() unpacked = Path(temp_dir) / 'unpacked' sdist_dir = unpacked / Path(filename).name.rstrip('.tar.gz') # Checking for .egg-info eggs = list(sdist_dir.glob('*.egg-info')) if eggs: egg_info = eggs[0] requires = egg_info / 'requires.txt' if requires.exists(): with requires.open() as f: return parse_requires(f.read()) return # Still nothing, assume no dependencies # We could probably get them by executing # python setup.py egg-info but I don't feel # confortable executing a file just for the sake # of getting dependencies. return def _download(self, url, dest): # type: (str, str) -> None r = get(url, stream=True) with open(dest, 'wb') as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) def _log(self, msg, level='info'): getattr(logger, level)('{}: {}'.format(self._name, msg))
def main(group_id, location, time_boundary, event_status, pandoc): key_path = os.path.normpath(os.path.expanduser('~/.meetup.com-key')) if os.path.exists(key_path): with open(key_path) as fh: key = fh.read().strip() cache = FileCache('.web_cache', forever=True) requests = CacheControl( Session(), cache, cache_etags=False, heuristic=ExpiresAfter(days=1) ) while True: resp = requests.get('https://api.meetup.com/status', params=dict(key=key)) if resp.status_code == 200: break elif resp.status_code == 401: click.echo('Your meetup.com key is required. You can get it from https://secure.meetup.com/meetup_api/key/\n') if click.confirm('Open https://secure.meetup.com/meetup_api/key/ in your web browser?'): click.launch('https://secure.meetup.com/meetup_api/key/') click.echo('') key = click.prompt('Key', hide_input=True) else: click.fail('Failed to get meetup.com status. Response was {!r}'.format(resp.text)) click.secho('For convenience your key is saved in `{}`.\n'.format(key_path), fg='magenta') with open(key_path, 'w') as fh: fh.write(key) while not location: location = location or get_input('Location: ', completer=WordCompleter(['cluj', 'iasi', 'timisoara'], ignore_case=True)) while True: group_id = group_id or get_input('Group ID: ', completer=WordCompleter(['Cluj-py', 'RoPython-Timisoara'], ignore_case=True)) resp = requests.get('https://api.meetup.com/2/events', params=dict( key=key, group_urlname=group_id, time=time_boundary, status=event_status, )) if resp.status_code == 200: json = resp.json() if json['results']: break else: click.secho('Invalid group `{}`. It has no events!'.format(group_id), fg='red') group_id = None if resp.status_code == '400': click.fail('Failed to get make correct request. Response was {!r}'.format(resp.text)) else: click.secho('Invalid group `{}`. Response was [{}] {!r}'.format(group_id, resp.status_code, resp.text), fg='red') # click.echo(pformat(dict(resp.headers))) for event in json['results']: dt = datetime.fromtimestamp(event['time']/1000) click.echo("{}: {}".format( dt.strftime('%Y-%m-%d %H:%M:%S'), event['name'] )) existing_path = glob(os.path.join('content', '*', dt.strftime('%Y-%m-%d*'), 'index.rst')) if existing_path: if len(existing_path) > 1: click.secho('\tERROR: multiple paths matched: {}'.format(existing_path)) else: click.secho('\t`{}` already exists. Not importing.'.format(*existing_path), fg='yellow') else: target_dir = os.path.join('content', location, '{}-{}'.format(dt.strftime('%Y-%m-%d'), slugify(event['name']))) target_path = os.path.join(target_dir, 'index.rst') if not os.path.exists(target_dir): os.makedirs(target_dir) if pandoc: with tempfile.NamedTemporaryFile(delete=False) as fh: fh.write(event['description'].encode('utf-8')) rst = subprocess.check_output(['pandoc', '--from=html', '--to=rst', fh.name]).decode('utf-8') print fh.name #os.unlink(fh.name) else: stream = StringIO() html2rest(event['description'].encode('utf-8'), writer=stream) rst = stream.getvalue().decode('utf-8') with io.open(target_path, 'w', encoding='utf-8') as fh: fh.write('''{name} ############################################################### :tags: unknown :registration: meetup.com: {event_url} {rst}'''.format(rst=rst, **event)) click.secho('\tWrote `{}`.'.format(target_path), fg='green')
def get_frag_by_loc_from_osm(imtiles_file, loci, zoom_level=0, padding=0, tile_size=256, no_cache=False): width = 360 height = 180 ims = [] prefixes = ['a', 'b', 'c'] prefix_idx = math.floor(random() * len(prefixes)) osm_src = 'http://{}.tile.openstreetmap.org'.format(prefixes[prefix_idx]) s = CacheControl(requests.Session()) for locus in loci: id = locus[-1] if not no_cache: osm_snip = None try: osm_snip = np.load(BytesIO(rdb.get('osm_snip_%s' % id))) if osm_snip is not None: ims.append(osm_snip) continue except: pass start_lng = locus[0] end_lng = locus[1] start_lat = locus[2] end_lat = locus[3] if not is_within(start_lng + 180, end_lng + 180, end_lat + 90, start_lat + 90, width, height): ims.append(None) continue # Get tile ids start1, start2 = get_tile_pos_from_lng_lat(start_lng, start_lat, zoom_level) end1, end2 = get_tile_pos_from_lng_lat(end_lng, end_lat, zoom_level) xPad = padding * (end1 - start1) yPad = padding * (start2 - end2) start1 -= xPad end1 += xPad start2 += yPad end2 -= yPad tile_start1_id = math.floor(start1) tile_start2_id = math.floor(start2) tile_end1_id = math.floor(end1) tile_end2_id = math.floor(end2) start1 = math.floor(start1 * tile_size) start2 = math.floor(start2 * tile_size) end1 = math.ceil(end1 * tile_size) end2 = math.ceil(end2 * tile_size) tiles_x_range = range(tile_start1_id, tile_end1_id + 1) tiles_y_range = range(tile_start2_id, tile_end2_id + 1) # Make sure that no more than 6 standard tiles (256px) are loaded. if tile_size * len(tiles_x_range) > hss.SNIPPET_OSM_MAX_DATA_DIM: raise SnippetTooLarge() if tile_size * len(tiles_y_range) > hss.SNIPPET_OSM_MAX_DATA_DIM: raise SnippetTooLarge() # Extract image tiles tiles = [] for y in tiles_y_range: for x in tiles_x_range: src = ('{}/{}/{}/{}.png'.format(osm_src, zoom_level, x, y)) r = s.get(src) if r.status_code == 200: tiles.append(Image.open(BytesIO(r.content)).convert('RGB')) else: tiles.append(None) osm_snip = get_frag_from_image_tiles(tiles, tile_size, tiles_x_range, tiles_y_range, tile_start1_id, tile_start2_id, start1, end1, start2, end2) if not no_cache: with BytesIO() as b: np.save(b, osm_snip) rdb.set('osm_snip_%s' % id, b.getvalue(), 60 * 30) ims.append(osm_snip) return ims
def main(control_url, test_url, all_available, package_ignore, diff_compose, undercloud_only): """ This script takes two urls for ci log files and compares the rpms installed in each environment. We have named the first a control_url as in a control and experiment to display the diff. The script will pull rpms from ALL the nodes available, and the containers hosted on that node. This workds with both upstream tripleo jobs and infrared job logs. USAGE: The script expects only the base url ( up to the logs dir ) of the logs from any job. e.g. https://logserver.rdoproject.org/foo/check/jobs/7822e6c/logs/ """ diff_builds = DiffBuilds() debug_format = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s' logging.basicConfig(level=logging.DEBUG, format=debug_format, datefmt='%m-%d %H:%M', filename='debug.log', filemode='w') console = logging.StreamHandler() console.setLevel(logging.INFO) formatter = logging.Formatter(': %(levelname)-8s %(message)s') console.setFormatter(formatter) logging.getLogger('').addHandler(console) sess = requests.session() cached_sess = CacheControl(sess) full_package_diff = {} # default ignore ignore_packages = {".*debuginfo", ".*debugsource", ".*-devel", ".*-doc"} # debug inputs logging.debug("input: control_url: {}".format(control_url)) logging.debug("input: test_url: {}".format(test_url)) logging.debug("input: all_available: {}".format(all_available)) logging.debug("input: package_ignore: {}".format(package_ignore)) logging.debug("input: diff_compose: {}".format(diff_compose)) logging.debug("input: undercloud_only: {}".format(undercloud_only)) if package_ignore: with open(package_ignore) as f: ignore_packages.update(set(f.read().splitlines())) if not all_available and not diff_compose: results = diff_builds.execute_installed_package_diff( cached_sess, control_url, test_url, all_available, ignore_packages, undercloud_only) full_package_diff = results[0] column_list = results[1] elif all_available and not diff_compose: results = diff_builds.execute_repoquery_diff(cached_sess, control_url, test_url, all_available, ignore_packages) full_package_diff = results[0] column_list = results[1] elif diff_compose: results = diff_builds.execute_compose_diff(cached_sess, control_url, test_url, all_available, ignore_packages) full_package_diff = results[0] column_list = results[1] else: print("Error with options provided") logging.info("\n\n **** RESULT **** \n\n") for k in full_package_diff.keys(): diff_builds.display_packages_table(k, column_list, full_package_diff[k])
class Handler(object): def __init__(self, settings): self.settings = settings self.handler_config = settings.config[settings.env]['handler'] self.cached_session = CacheControl(requests.session()) self._init_db() def _get_db_conn(self): return sqlite3.connect(self.handler_config['dbfile']) def _init_db(self): con = self._get_db_conn() cur = con.cursor() stmt = ''' CREATE TABLE IF NOT EXISTS temporary_bounces ( bounced_address TEXT, domain TEXT, counter INTEGER ); ''' cur.execute(stmt.strip()) con.commit() stmt = ''' CREATE TABLE IF NOT EXISTS permanent_bounces ( ts TIMESTAMP DEFAULT CURRENT_TIMESTAMP, bounced_address TEXT, domain TEXT, status_code INTEGER ); ''' cur.execute(stmt.strip()) con.commit() cur.close() con.close() def _increase_bounced_address_counter(self, bounced_address, domain): con = self._get_db_conn() cur = con.cursor() stmt = ''' INSERT OR REPLACE INTO temporary_bounces VALUES (:bounced_address, :domain, COALESCE( (SELECT counter FROM temporary_bounces WHERE bounced_address=:bounced_address AND domain=:domain), 0) + 1); ''' cur.execute(stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain }) con.commit() cur.close() con.close() def _get_bounced_address_counter(self, bounced_address, domain): con = self._get_db_conn() cur = con.cursor() stmt = ''' SELECT counter FROM temporary_bounces WHERE bounced_address=:bounced_address AND domain=:domain; ''' cur.execute(stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain }) row = cur.fetchone() result = 0 if row: result = int(row[0]) cur.close() con.close() return result def _find_address(self, address): con = self._get_db_conn() cur = con.cursor() stmt = ''' SELECT * FROM permanent_bounces WHERE bounced_address LIKE :bounced_address; ''' cur.execute(stmt.strip(), {'bounced_address': '%{0}%'.format(address)}) permanent_bounces = cur.fetchall() stmt = ''' SELECT * FROM temporary_bounces WHERE bounced_address LIKE :bounced_address; ''' cur.execute(stmt.strip(), {'bounced_address': '%{0}%'.format(address)}) temporary_bounces = cur.fetchall() cur.close() con.close() return permanent_bounces, temporary_bounces def _reset_bounced_address(self, bounced_address, domain): con = self._get_db_conn() cur = con.cursor() stmt = ''' DELETE FROM temporary_bounces WHERE bounced_address=:bounced_address AND domain=:domain; ''' cur.execute(stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain }) con.commit() cur.close() con.close() def _set_permanent_bounced_address(self, bounced_address, domain, status_code): con = self._get_db_conn() cur = con.cursor() stmt = ''' INSERT INTO permanent_bounces (bounced_address, domain, status_code) VALUES (:bounced_address, :domain, :status_code); ''' cur.execute( stmt.strip(), { 'bounced_address': bounced_address, 'domain': domain, 'status_code': status_code }) con.commit() cur.close() con.close() def _get_origin_to_domains(self, msg): ''' return the domains to which the origin email was sent ''' to_addresses = [ address for _, address in [parseaddr(x.strip()) for x in msg['To'].split(",")] ] domains = [] for a in to_addresses: parts = tldextract.extract(a.split("@")[1]) domains.append("%s.%s" % (parts[-2], parts[-1])) return domains def _store_permanent_bounced_email(self, bounced_address, body): if not ('permanent_bounced_emails_path' in self.handler_config and body): return dir_path = os.path.join( self.handler_config['permanent_bounced_emails_path'], bounced_address[0:2].lower()) if not os.path.exists(dir_path): os.makedirs(dir_path) path = os.path.join(dir_path, 
bounced_address + '.gz') content = bytes(body) with gzip.open(path, 'wb') as f: f.write(content) def _handle_out_of_office_message(self, msg): pass def _handle_temporary_bounced_address(self, bounced_address, domain, body): temporary_threshold = self.handler_config['temporary_threshold'] current_counter = self._get_bounced_address_counter( bounced_address, domain) if current_counter > temporary_threshold: self._handle_permanent_bounced_address(bounced_address, domain, body) self._reset_bounced_address(bounced_address, domain) return self._increase_bounced_address_counter(bounced_address, domain) def _default_url_resolver(self, bounced_address, config): tpl = URITemplate(config['base_url']) return tpl.expand(address=bounced_address) def _xikolo_url_resolver(self, bounced_address, config): response = self.cached_session.get(config['base_url']) uri = response.json()['email_suspensions_url'] tpl = URITemplate(uri) return tpl.expand(address=bounced_address) def _handle_permanent_bounced_address(self, bounced_address, domain, body): config = self.handler_config['domains'][domain] if 'url_resolver' in config and config['url_resolver'] == 'xikolo': endpoint = self._xikolo_url_resolver(bounced_address, config) else: endpoint = self._default_url_resolver(bounced_address, config) logger.debug("Post request to: %s for address: %s", endpoint, bounced_address) response = self.cached_session.post(endpoint, data={}) logger.info("Response (%s): %s ", response.status_code, response.text) self._set_permanent_bounced_address(bounced_address, domain, response.status_code) self._store_permanent_bounced_email(bounced_address, body) def set_permanent_bounced_address(self, bounced_address, domain): ''' handles manually bounced email addresses ''' logger.debug("Permanent: %s", bounced_address) self._handle_permanent_bounced_address(bounced_address, domain, '') def find_address(self, address): ''' Find an email address within permanent or temporary bounced emails ''' logger.debug("Find: %s", address) permanent_bounces, temporary_bounces = self._find_address(address) logger.debug('> Permanent bounces for address: "{0}"'.format(address)) for entry in permanent_bounces: logger.debug(entry) logger.debug('> Temporary bounces for address: "{0}"'.format(address)) for entry in temporary_bounces: logger.debug(entry) def handle_message(self, body): ''' handles soft and hard bounced emails ''' msg = email.message_from_bytes(bytes(body)) logger.info("------------- INCOMING MESSAGE -------------") for key, value in msg.items(): if any(key.startswith(h) for h in ['From', 'To', 'Subject']): logger.info("%s:\t%s", key, value) for domain in self._get_origin_to_domains(msg): if domain in self.handler_config['domains'].keys(): break else: raise BouncedEmailException("Domain '%s' not found" % domain) t, p = all_failures(msg) def validate_addresses(bounced_addresses): address_list = [] for address in bounced_addresses: address = address.decode('utf-8') if validate_email(address): address_list.append(address) return address_list temporary = validate_addresses(t) permanent = validate_addresses(p) if not (temporary or permanent): return self._handle_out_of_office_message(msg) logger.info("Domain: %s", domain) for bounced_address in temporary: # sometimes a temporary failure is a permanent failure as well (strange, but yes) if bounced_address in permanent: continue logger.info("Temporary: %s", bounced_address) self._handle_temporary_bounced_address(bounced_address, domain, body) for bounced_address in permanent: logger.info("Permanent: %s", 
bounced_address) self._handle_permanent_bounced_address(bounced_address, domain, body)
def __init__(self, cache=None, **kw):
    self._cache = os.path.realpath(
        os.path.expanduser(cache or self.DEFAULT_CACHE))
    super(CachedRequestsContext, self).__init__(
        CacheControl(requests.session(), cache=FileCache(self._cache)), **kw)
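For context, here is a minimal standalone sketch of the pattern CachedRequestsContext wraps: expand a user-supplied cache directory and back a requests session with a FileCache. The helper name make_cached_session and the default directory are assumptions for illustration only, not part of the snippet above.

import os

import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache


def make_cached_session(cache_dir='~/.example-cache'):
    # expand the user-supplied directory and wrap the session with a file-backed cache
    cache_path = os.path.realpath(os.path.expanduser(cache_dir))
    return CacheControl(requests.Session(), cache=FileCache(cache_path))


# session = make_cached_session()
# session.get('https://example.com/')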
class Entry(Base):
    __tablename__ = 'entry'

    id = Column(Integer, primary_key=True)
    url = Column(String(1024))
    expiry = Column(DateTime)
    response = Column(PickleType)

    def __repr__(self):
        return 'Entry[url={}, expiry={}, response={}]'.format(
            self.url, self.expiry, self.response)


engine = create_engine(app.config['SQLALCHEMY_DATABASE_URI'])
Session = sessionmaker(bind=engine)
req_session = CacheControl(requests.session(), cache=FileCache('.webcache'))


def init_db():
    Base.metadata.create_all(engine)


def fetch(url):
    try:
        session = Session()
        now = datetime.datetime.utcnow()
        cached = session.query(Entry).filter_by(url=url).first()
        app.logger.debug('check for cached response %s', cached)
        if not cached or now >= cached.expiry:
class LegacyRepository(PyPiRepository): def __init__(self, name, url): if name == 'pypi': raise ValueError('The name [pypi] is reserved for repositories') self._packages = [] self._name = name self._url = url.rstrip('/') self._cache_dir = Path(CACHE_DIR) / 'cache' / 'repositories' / name self._cache = CacheManager({ 'default': 'releases', 'serializer': 'json', 'stores': { 'releases': { 'driver': 'file', 'path': str(self._cache_dir) }, 'packages': { 'driver': 'dict' }, 'matches': { 'driver': 'dict' } } }) self._session = CacheControl(requests.session(), cache=FileCache( str(self._cache_dir / '_http'))) @property def name(self): return self._name def find_packages(self, name, constraint=None, extras=None, allow_prereleases=False): packages = [] if constraint is not None and not isinstance(constraint, VersionConstraint): constraint = parse_constraint(constraint) key = name if constraint: key = '{}:{}'.format(key, str(constraint)) if self._cache.store('matches').has(key): versions = self._cache.store('matches').get(key) else: page = self._get('/{}'.format( canonicalize_name(name).replace('.', '-'))) if page is None: raise ValueError('No package named "{}"'.format(name)) versions = [] for version in page.versions: if (not constraint or (constraint and constraint.allows(version))): versions.append(version) self._cache.store('matches').put(key, versions, 5) for version in versions: package = Package(name, version) if extras is not None: package.requires_extras = extras packages.append(package) return packages def package(self, name, version, extras=None): # type: (...) -> poetry.packages.Package """ Retrieve the release information. This is a heavy task which takes time. We have to download a package to get the dependencies. We also need to download every file matching this release to get the various hashes. Note that, this will be cached so the subsequent operations should be much faster. """ try: index = self._packages.index( poetry.packages.Package(name, version, version)) return self._packages[index] except ValueError: if extras is None: extras = [] release_info = self.get_release_info(name, version) package = poetry.packages.Package(name, version, version) requires_dist = release_info['requires_dist'] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(';')[0] dependency = dependency_from_pep_508(req) if dependency.extras: for extra in dependency.extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get('summary', '') # Adding hashes information package.hashes = release_info['digests'] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += package.extras[extra] self._packages.append(package) return package def get_release_info(self, name, version): # type: (str, str) -> dict """ Return the release information given a package name and a version. The information is returned from the cache if it exists or retrieved from the remote server. 
""" return self._cache.store('releases').remember_forever( '{}:{}'.format(name, version), lambda: self._get_release_info(name, version)) def _get_release_info(self, name, version): # type: (str, str) -> dict page = self._get('/{}'.format( canonicalize_name(name).replace('.', '-'))) if page is None: raise ValueError('No package named "{}"'.format(name)) data = { 'name': name, 'version': version, 'summary': '', 'requires_dist': [], 'requires_python': [], 'digests': [] } links = list(page.links_for_version(Version.parse(version))) urls = {} hashes = [] default_link = links[0] for link in links: if link.is_wheel: urls['bdist_wheel'] = link.url elif link.filename.endswith('.tar.gz'): urls['sdist'] = link.url elif link.filename.endswith( ('.zip', '.bz2')) and 'sdist' not in urls: urls['sdist'] = link.url hash = link.hash if link.hash_name == 'sha256': hashes.append(hash) data['digests'] = hashes if not urls: if default_link.is_wheel: m = wheel_file_re.match(default_link.filename) python = m.group('pyver') platform = m.group('plat') if python == 'py2.py3' and platform == 'any': urls['bdist_wheel'] = default_link.url elif default_link.filename.endswith('.tar.gz'): urls['sdist'] = default_link.url elif default_link.filename.endswith( ('.zip', '.bz2')) and 'sdist' not in urls: urls['sdist'] = default_link.url else: return data info = self._get_info_from_urls(urls) data['summary'] = info['summary'] data['requires_dist'] = info['requires_dist'] data['requires_python'] = info['requires_python'] return data def _get(self, endpoint): # type: (str) -> Union[Page, None] url = self._url + endpoint response = self._session.get(url) if response.status_code == 404: return return Page(url, response.content, response.headers)
def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)


def filehash(filename, hashtype, blocksize=65536):
    hash = hashtype()
    with open(filename, "rb") as f:
        for block in iter(lambda: f.read(blocksize), b""):
            hash.update(block)
    return hash.hexdigest()


forever_cache = FileCache('http_cache', forever=True)
sess = CacheControl(requests.Session(), forever_cache)

# get the remote version list fragments
r = sess.get(
    'https://files.minecraftforge.net/maven/net/minecraftforge/forge/maven-metadata.json'
)
r.raise_for_status()
main_json = r.json()
assert type(main_json) == dict

r = sess.get(
    'https://files.minecraftforge.net/maven/net/minecraftforge/forge/promotions_slim.json'
)
r.raise_for_status()
promotions_json = r.json()
assert type(promotions_json) == dict
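A small follow-up sketch of the same "forever" pattern with the cache passed by keyword; with forever=True the FileCache keeps cached files on disk rather than deleting them, so entries persist across runs. The first URL is taken from the snippet above, the cache directory name is illustrative.

import requests
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

# forever=True means the file cache never removes its entries on delete()
forever_sess = CacheControl(requests.Session(),
                            cache=FileCache('forge_http_cache', forever=True))

resp = forever_sess.get(
    'https://files.minecraftforge.net/maven/net/minecraftforge/forge/maven-metadata.json')
resp.raise_for_status()
versions = resp.json()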
import sqlite3

import requests
from bs4 import BeautifulSoup
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

sess = CacheControl(requests.Session(), cache=FileCache(
    'relaxdays_cache'))  # we don't want to torture the server unnecessarily
standard_url = 'https://relaxdays.de/catalogsearch/result/index/?q={}&product_list_dir=asc&product_list_order=sale_rank&product_list_limit=48'

db = sqlite3.connect("daten.db")
c = db.cursor()

working = []
broken = []


def get_items(category: str):
    url = standard_url.format(category)
    r = sess.get(url)
    soup = BeautifulSoup(r.text, 'lxml')
    try:
        warning = soup.find('div', {'class': 'message notice'})
        if warning is None:
            items_html = soup.find('ol', {
                'class': 'products list items product-items'
            }).find_all('li')
            c.execute(
                'CREATE TABLE IF NOT EXISTS "{}" (id NUMERIC UNIQUE, name TEXT, price NUMERIC, url TEXT, image TEXT)'
                .format(category))
An example of how to transparently cache jikanpy requests
using the cachecontrol/requests modules

To install:

    pip install --user cachecontrol[filecache] jikanpy
"""

import time

import requests
import jikanpy
from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter
from cachecontrol.caches.file_cache import FileCache

# define heuristic, how long requests should stay in cache
# you can modify this to fit whatever you want
# it accepts the same kwargs as datetime.timedelta:
# https://docs.python.org/3/library/datetime.html#datetime.timedelta
expires = ExpiresAfter(days=1)

# create session and mount file cache
session = CacheControl(requests.Session(), heuristic=expires, cache=FileCache("cache_dir"))

# use session for jikanpy
j = jikanpy.Jikan(session=session)

# the second request here is cached
print(j.anime(1)["title"])
print(j.anime(1)["title"])
class RequestsClient(HttpClient): """An implementation of HttpClient that uses Requests as its HTTP Client Attributes: timeout (int): The default timeout for all API requests. """ def __init__(self, timeout=60, cache=False, max_retries=None, retry_interval=None): """The constructor. Args: timeout (float): The default global timeout(seconds). """ urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) self.timeout = timeout self.session = requests.session() if max_retries and retry_interval: retries = Retry(total=max_retries, backoff_factor=retry_interval) self.session.mount('http://', HTTPAdapter(max_retries=retries)) self.session.mount('https://', HTTPAdapter(max_retries=retries)) if cache: self.session = CacheControl(self.session) def execute_as_string(self, request): """Execute a given HttpRequest to get a string response back Args: request (HttpRequest): The given HttpRequest to execute. Returns: HttpResponse: The response of the HttpRequest. """ self.session.verify = not Configuration.skip_ssl_verification response = self.session.request(HttpMethodEnum.to_string( request.http_method), request.query_url, headers=request.headers, params=request.query_parameters, data=request.parameters, files=request.files, timeout=self.timeout) return self.convert_response(response, False) def execute_as_binary(self, request): """Execute a given HttpRequest to get a binary response back Args: request (HttpRequest): The given HttpRequest to execute. Returns: HttpResponse: The response of the HttpRequest. """ self.session.verify = not Configuration.skip_ssl_verification response = self.session.request(HttpMethodEnum.to_string( request.http_method), request.query_url, headers=request.headers, params=request.query_parameters, data=request.parameters, files=request.files, timeout=self.timeout) return self.convert_response(response, True) def convert_response(self, response, binary): """Converts the Response object of the HttpClient into an HttpResponse object. Args: response (dynamic): The original response object. Returns: HttpResponse: The converted HttpResponse object. """ if binary: return HttpResponse(response.status_code, response.headers, response.content) else: return HttpResponse(response.status_code, response.headers, response.text)
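One caveat worth noting about the combination used by RequestsClient above: CacheControl() mounts its own adapter for the 'http://' and 'https://' prefixes, which replaces an HTTPAdapter mounted earlier on those prefixes. If both retries and caching are wanted on the same session, one option is to give the retry policy to the CacheControlAdapter itself. This is a sketch under that assumption; the retry values and cache path are placeholders.

import requests
from urllib3.util.retry import Retry
from cachecontrol import CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache

retries = Retry(total=3, backoff_factor=0.5)
# CacheControlAdapter subclasses requests' HTTPAdapter, so it accepts max_retries
adapter = CacheControlAdapter(cache=FileCache('.http_cache'), max_retries=retries)

session = requests.Session()
session.mount('http://', adapter)
session.mount('https://', adapter)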
DEBUG = 1

import ssl
import datetime
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
import tempfile
import os

import requests  # required for requests.session() below; missing from the original excerpt

USERAGENT = {
    'User-agent': 'LibreofficeProjectMacro/0.1 ([email protected])'
}
CACHE_DIRECTORY = os.path.join(tempfile.gettempdir(), 'LibreofficeProjectMacro')
WIKILOGINS = {}

cached_session = CacheControl(requests.session(), cache=FileCache(CACHE_DIRECTORY))
cached_session.headers.update(USERAGENT)


def writeInformation(level, information):
    try:
        global CURRENT_INFORMATION_ROW
        desktop = XSCRIPTCONTEXT.getDesktop()
        model = desktop.getCurrentComponent()
        sheet = config('Information')
        if not model.Sheets.hasByName(sheet):
            model.Sheets.insertNewByName(sheet, model.Sheets.getCount() + 1)
        sheet = model.Sheets.getByName(sheet)
        try:
            index = CURRENT_INFORMATION_ROW
        except NameError:
def setup(self):
    self.sess = Session()
    self.cache_sess = CacheControl(self.sess, heuristic=ExpiresAfter(days=1))
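A brief sketch of what the ExpiresAfter heuristic above does in practice: it is a session-wide override that marks every response as fresh for the given period, regardless of the server's own caching headers, so a repeated request within that window should be answered from the cache (the default in-memory DictCache here). The URL is a placeholder.

import requests
from cachecontrol import CacheControl
from cachecontrol.heuristics import ExpiresAfter

sess = CacheControl(requests.Session(), heuristic=ExpiresAfter(hours=6))
sess.get('https://example.com/rarely-changing-resource')   # hits the network
sess.get('https://example.com/rarely-changing-resource')   # answered from the in-memory cache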
class Connection(object): """ Handler for connection and calls to the Open Targets Validation Platform REST API """ _AUTO_GET_TOKEN = 'auto' def __init__(self, host='https://www.targetvalidation.org', port=443, api_version='latest', auth_app_name = None, auth_secret = None, use_http2=False, ): """ Args: host (str): host serving the API port (int): port to use for connection to the API api_version (str): api version to point to, default to 'latest' auth_app_name (str): app_name if using authentication auth_secret (str): secret if using authentication use_http2 (bool): use http2 client """ self._logger = logging.getLogger(__name__) self.host = host self.port = str(port) self.api_version = api_version self.auth_app_name = auth_app_name self.auth_secret = auth_secret if self.auth_app_name and self.auth_secret: self.use_auth = True else: self.use_auth = False self.token = None self.use_http2 = use_http2 session= requests.Session() if self.use_http2: session.mount(host, HTTP20Adapter()) self.session = CacheControl(session) self._get_remote_api_specs() def _build_url(self, endpoint): return '{}:{}/api/{}{}'.format(self.host, self.port, self.api_version, endpoint,) @staticmethod def _auto_detect_post(params): """ Determine if a post request should be made instead of a get depending on the size of the parameters in the request. Args: params (dict): params to pass in the request Returns: Boolean: True if post is needed """ if params: for k,v in params.items(): if isinstance(v, (list, tuple)): if len(v)>3: return True return False def get(self, endpoint, params=None): """ makes a GET request Args: endpoint (str): REST API endpoint to call params (dict): request payload Returns: Response: request response """ if self._auto_detect_post(params): self._logger.debug('switching to POST due to big size of params') return self.post(endpoint, data=params) return Response(self._make_request(endpoint, params=params, method='GET')) def post(self, endpoint, data=None): """ makes a POST request Args: endpoint (str): REST API endpoint to call data (dict): request payload Returns: Response: request response """ return Response(self._make_request(endpoint, data=data, method='POST')) def _make_token_request(self, expire = 60): """ Asks for a token to the API Args: expire (int): expiration time for the token Returns: response for the get token request """ return self._make_request('/public/auth/request_token', params={'app_name':self.auth_app_name, 'secret':self.auth_secret, 'expiry': expire}, headers={'Cache-Control':'no-cache',} ) def get_token(self, expire = 60): """ Asks for a token to the API Args: expire (int): expiration time for the token Returns: str: the token served by the API """ response = self._make_token_request(expire) return response.json()['token'] def _make_request(self, endpoint, params = None, data = None, method = HTTPMethods.GET, headers = {}, rate_limit_fail = False, **kwargs): """ Makes a request to the REST API Args: endpoint (str): endpoint of the REST API params (dict): payload for GET request data (dict): payload for POST request method (HTTPMethods): request method, either HTTPMethods.GET or HTTPMethods.POST. Defaults to HTTPMethods.GET headers (dict): HTTP headers for the request rate_limit_fail (bool): If True raise exception when usage limit is exceeded. If False wait and retry the request. Defaults to False. 
Keyword Args: **kwargs: forwarded to requests Returns: a response from requests """ def call(): headers['User-agent']='Open Targets Python Client/%s'%str(VERSION) if self.use_http2 and set(headers.keys())&INVALID_HTTP2_HEADERS: for h in INVALID_HTTP2_HEADERS: if h in headers: del headers[h] return self.session.request(method, self._build_url(endpoint), params = params, json = data, headers = headers, **kwargs) 'order params to allow efficient caching' if params is not None: if isinstance(params, dict): params = sorted(params.items()) else: params = sorted(params) if self.use_auth and not 'request_token' in endpoint: if self.token is None: self._update_token() if self.token is not None: headers['Auth-Token']=self.token response = None default_retry_after = 5 if not rate_limit_fail: status_code = 429 while status_code in [429,419]: try: response = call() status_code = response.status_code if status_code == 429: retry_after=default_retry_after if 'Retry-After' in response.headers: retry_after = float(response.headers['Retry-After']) self._logger.warning('Maximum usage limit hit. Retrying in {} seconds'.format(retry_after)) time.sleep(retry_after) elif status_code == 419: self._update_token(force = True) headers['Auth-Token'] = self.token time.sleep(0.5) except MaxRetryError as e: self._logger.exception(e.args[0].reason) self._logger.warning('Problem connecting to the remote API. Retrying in {} seconds'.format(default_retry_after)) time.sleep(default_retry_after) except OSError as e: self._logger.exception(str(e)) self._logger.warning('Problem connecting to the remote API. Retrying in {} seconds'.format(default_retry_after)) time.sleep(default_retry_after) else: response = call() response.raise_for_status() return response def _update_token(self, force = False): """ Update token when expired """ if self.token and not force: token_valid_response = self._make_request('/public/auth/validate_token', headers={'Auth-Token':self.token}) if token_valid_response.status_code == 200: return elif token_valid_response.status_code == 419: pass else: token_valid_response.raise_for_status() self.token = self.get_token() def _get_remote_api_specs(self): """ Fetch and parse REST API documentation """ r= self.session.get(self.host+':'+self.port+'/api/docs/swagger.yaml') r.raise_for_status() self.swagger_yaml = r.text self.api_specs = yaml.load(self.swagger_yaml) self.endpoint_validation_data={} for p, data in self.api_specs['paths'].items(): p=p.split('{')[0] if p[-1]== '/': p=p[:-1] self.endpoint_validation_data[p] = {} for method, method_data in data.items(): if 'parameters' in method_data: params = {} for par in method_data['parameters']: par_type = par.get('type', 'string') params[par['name']]=par_type self.endpoint_validation_data[p][method] = params remote_version = self.get('/public/utils/version').data if remote_version != VERSION: self._logger.warning('The remote server is running the API with version {}, but the client expected {}. They may not be compatible.'.format(remote_version, VERSION)) def validate_parameter(self, endpoint, filter_type, value, method=HTTPMethods.GET): """ Validate payload to send to the REST API based on info fetched from the API documentation Args: endpoint (str): endpoint of the REST API filter_type (str): the parameter sent for the request value: the value sent for the request method (HTTPMethods): request method, either HTTPMethods.GET or HTTPMethods.POST. 
Defaults to HTTPMethods.GET Raises AttributeError: if validation is not passed """ endpoint_data = self.endpoint_validation_data[endpoint][method] if filter_type in endpoint_data: if endpoint_data[filter_type] == 'string' and isinstance(value, str): return elif endpoint_data[filter_type] == 'boolean' and isinstance(value, bool): return elif endpoint_data[filter_type] == 'number' and isinstance(value, (int, float)): return raise AttributeError('{}={} is not a valid parameter for endpoint {}'.format(filter_type, value, endpoint)) def api_endpoint_docs(self, endpoint): """ Returns the documentation available for a given REST API endpoint Args: endpoint (str): endpoint of the REST API Returns: dict: documentation for the endpoint parsed from YAML docs """ return self.api_specs['paths'][endpoint] def get_api_endpoints(self): """ Get a list of available endpoints Returns: list: available endpoints """ return self.api_specs['paths'].keys() def close(self): """ Close connection to the REST API """ self.session.close() def ping(self): """ Pings the API as a live check Returns: bool: True if pinging the raw response as a ``str`` if the API has a non standard name """ response = self.get('/public/utils/ping') if response.data=='pong': return True elif response.data: return response.data return False
import os
import shutil
import stat
import urllib.request
import zipfile
from typing import Any, Dict, Optional

import requests
from cachecontrol import CacheControl, CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

from shared import configuration, perf
from shared.pd_exception import OperationalException

SESSION = CacheControl(requests.Session(), cache=FileCache(configuration.get('web_cache')))
SESSION.mount('http://whatsinstandard.com', CacheControlAdapter(heuristic=ExpiresAfter(days=14)))


def unzip(url: str, path: str) -> str:
    location = '{scratch_dir}/zip'.format(scratch_dir=configuration.get('scratch_dir'))

    def remove_readonly(func, path, _):
        os.chmod(path, stat.S_IWRITE)
        func(path)

    shutil.rmtree(location, True, remove_readonly)
    os.mkdir(location)
    store(url, '{location}/zip.zip'.format(location=location))
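A short sketch of the per-host pattern used above: the FileCache-backed session caches whatever the servers allow, while a separately mounted CacheControlAdapter applies its own heuristic to a single slow-moving host. The host name and cache path are placeholders.

import requests
from cachecontrol import CacheControl, CacheControlAdapter
from cachecontrol.caches.file_cache import FileCache
from cachecontrol.heuristics import ExpiresAfter

session = CacheControl(requests.Session(), cache=FileCache('.web_cache'))
# responses from this host are treated as fresh for a week, independent of their headers
session.mount('https://slow-moving.example.com',
              CacheControlAdapter(heuristic=ExpiresAfter(days=7)))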
def test_expires_heuristic_arg(self):
    sess = Session()
    cached_sess = CacheControl(sess, heuristic=Mock())
    assert cached_sess
def setup(self):
    self.sess = Session()
    self.cached_sess = CacheControl(self.sess, heuristic=OneDayCache())
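OneDayCache is one of the heuristics shipped with cachecontrol; a custom heuristic can be written the same way by subclassing BaseHeuristic and rewriting the response headers before they are cached. The class below is a minimal sketch of that pattern, not the OneDayCache implementation itself; the name CacheFor is hypothetical.

from calendar import timegm
from datetime import datetime, timedelta
from email.utils import formatdate

from cachecontrol.heuristics import BaseHeuristic


class CacheFor(BaseHeuristic):
    """Hypothetical heuristic: treat every response as fresh for a fixed number of seconds."""

    def __init__(self, seconds):
        self.seconds = seconds

    def update_headers(self, response):
        # headers returned here overwrite the server's before the response is cached
        expires = datetime.utcnow() + timedelta(seconds=self.seconds)
        return {
            'expires': formatdate(timegm(expires.timetuple()), usegmt=True),
            'cache-control': 'public',
        }

    def warning(self, response):
        return '110 - "Response is Stale"'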
class PyPiRepository(Repository): CACHE_VERSION = parse_constraint("1.0.0b2") def __init__(self, url="https://pypi.org/", disable_cache=False, fallback=True): self._url = url self._disable_cache = disable_cache self._fallback = fallback release_cache_dir = Path(CACHE_DIR) / "cache" / "repositories" / "pypi" self._cache = CacheManager( { "default": "releases", "serializer": "json", "stores": { "releases": {"driver": "file", "path": str(release_cache_dir)}, "packages": {"driver": "dict"}, }, } ) self._cache_control_cache = FileCache(str(release_cache_dir / "_http")) self._session = CacheControl(session(), cache=self._cache_control_cache) self._inspector = Inspector() super(PyPiRepository, self).__init__() self._name = "PyPI" @property def url(self): # type: () -> str return self._url @property def authenticated_url(self): # type: () -> str return self._url def find_packages( self, name, # type: str constraint=None, # type: Union[VersionConstraint, str, None] extras=None, # type: Union[list, None] allow_prereleases=False, # type: bool ): # type: (...) -> List[Package] """ Find packages on the remote server. """ if constraint is None: constraint = "*" if not isinstance(constraint, VersionConstraint): constraint = parse_constraint(constraint) if isinstance(constraint, VersionRange): if ( constraint.max is not None and constraint.max.is_prerelease() or constraint.min is not None and constraint.min.is_prerelease() ): allow_prereleases = True try: info = self.get_package_info(name) except PackageNotFound: self._log( "No packages found for {} {}".format(name, str(constraint)), level="debug", ) return [] packages = [] for version, release in info["releases"].items(): if not release: # Bad release self._log( "No release information found for {}-{}, skipping".format( name, version ), level="debug", ) continue try: package = Package(name, version) except ParseVersionError: self._log( 'Unable to parse version "{}" for the {} package, skipping'.format( version, name ), level="debug", ) continue if package.is_prerelease() and not allow_prereleases: continue if not constraint or (constraint and constraint.allows(package.version)): if extras is not None: package.requires_extras = extras packages.append(package) self._log( "{} packages found for {} {}".format(len(packages), name, str(constraint)), level="debug", ) return packages def package( self, name, # type: str version, # type: str extras=None, # type: (Union[list, None]) ): # type: (...) 
-> Union[Package, None] if extras is None: extras = [] release_info = self.get_release_info(name, version) package = Package(name, version, version) requires_dist = release_info["requires_dist"] or [] for req in requires_dist: try: dependency = dependency_from_pep_508(req) except InvalidMarker: # Invalid marker # We strip the markers hoping for the best req = req.split(";")[0] dependency = dependency_from_pep_508(req) except ValueError: # Likely unable to parse constraint so we skip it self._log( "Invalid constraint ({}) found in {}-{} dependencies, " "skipping".format(req, package.name, package.version), level="debug", ) continue if dependency.in_extras: for extra in dependency.in_extras: if extra not in package.extras: package.extras[extra] = [] package.extras[extra].append(dependency) if not dependency.is_optional(): package.requires.append(dependency) # Adding description package.description = release_info.get("summary", "") if release_info["requires_python"]: package.python_versions = release_info["requires_python"] if release_info["platform"]: package.platform = release_info["platform"] # Adding hashes information package.files = release_info["files"] # Activate extra dependencies for extra in extras: if extra in package.extras: for dep in package.extras[extra]: dep.activate() package.requires += package.extras[extra] return package def search(self, query): results = [] search = {"q": query} response = session().get(self._url + "search", params=search) content = parse(response.content, namespaceHTMLElements=False) for result in content.findall(".//*[@class='package-snippet']"): name = result.find("h3/*[@class='package-snippet__name']").text version = result.find("h3/*[@class='package-snippet__version']").text if not name or not version: continue description = result.find("p[@class='package-snippet__description']").text if not description: description = "" try: result = Package(name, version, description) result.description = to_str(description.strip()) results.append(result) except ParseVersionError: self._log( 'Unable to parse version "{}" for the {} package, skipping'.format( version, name ), level="debug", ) return results def get_package_info(self, name): # type: (str) -> dict """ Return the package information given its name. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_package_info(name) return self._cache.store("packages").remember_forever( name, lambda: self._get_package_info(name) ) def _get_package_info(self, name): # type: (str) -> dict data = self._get("pypi/{}/json".format(name)) if data is None: raise PackageNotFound("Package [{}] not found.".format(name)) return data def get_release_info(self, name, version): # type: (str, str) -> dict """ Return the release information given a package name and a version. The information is returned from the cache if it exists or retrieved from the remote server. """ if self._disable_cache: return self._get_release_info(name, version) cached = self._cache.remember_forever( "{}:{}".format(name, version), lambda: self._get_release_info(name, version) ) cache_version = cached.get("_cache_version", "0.0.0") if parse_constraint(cache_version) != self.CACHE_VERSION: # The cache must be updated self._log( "The cache for {} {} is outdated. 
Refreshing.".format(name, version), level="debug", ) cached = self._get_release_info(name, version) self._cache.forever("{}:{}".format(name, version), cached) return cached def _get_release_info(self, name, version): # type: (str, str) -> dict self._log("Getting info for {} ({}) from PyPI".format(name, version), "debug") json_data = self._get("pypi/{}/{}/json".format(name, version)) if json_data is None: raise PackageNotFound("Package [{}] not found.".format(name)) info = json_data["info"] data = { "name": info["name"], "version": info["version"], "summary": info["summary"], "platform": info["platform"], "requires_dist": info["requires_dist"], "requires_python": info["requires_python"], "files": [], "_cache_version": str(self.CACHE_VERSION), } try: version_info = json_data["releases"][version] except KeyError: version_info = [] for file_info in version_info: data["files"].append( { "file": file_info["filename"], "hash": "sha256:" + file_info["digests"]["sha256"], } ) if self._fallback and data["requires_dist"] is None: self._log("No dependencies found, downloading archives", level="debug") # No dependencies set (along with other information) # This might be due to actually no dependencies # or badly set metadata when uploading # So, we need to make sure there is actually no # dependencies by introspecting packages urls = defaultdict(list) for url in json_data["urls"]: # Only get sdist and wheels if they exist dist_type = url["packagetype"] if dist_type not in ["sdist", "bdist_wheel"]: continue urls[dist_type].append(url["url"]) if not urls: return data info = self._get_info_from_urls(urls) data["requires_dist"] = info["requires_dist"] if not data["requires_python"]: data["requires_python"] = info["requires_python"] return data def _get(self, endpoint): # type: (str) -> Union[dict, None] try: json_response = self._session.get(self._url + endpoint) except TooManyRedirects: # Cache control redirect loop. 
# We try to remove the cache and try again self._cache_control_cache.delete(self._url + endpoint) json_response = self._session.get(self._url + endpoint) if json_response.status_code == 404: return None json_data = json_response.json() return json_data def _get_info_from_urls( self, urls ): # type: (Dict[str, List[str]]) -> Dict[str, Union[str, List, None]] # Checking wheels first as they are more likely to hold # the necessary information if "bdist_wheel" in urls: # Check fo a universal wheel wheels = urls["bdist_wheel"] universal_wheel = None universal_python2_wheel = None universal_python3_wheel = None platform_specific_wheels = [] for wheel in wheels: link = Link(wheel) m = wheel_file_re.match(link.filename) if not m: continue pyver = m.group("pyver") abi = m.group("abi") plat = m.group("plat") if abi == "none" and plat == "any": # Universal wheel if pyver == "py2.py3": # Any Python universal_wheel = wheel elif pyver == "py2": universal_python2_wheel = wheel else: universal_python3_wheel = wheel else: platform_specific_wheels.append(wheel) if universal_wheel is not None: return self._get_info_from_wheel(universal_wheel) info = {} if universal_python2_wheel and universal_python3_wheel: info = self._get_info_from_wheel(universal_python2_wheel) py3_info = self._get_info_from_wheel(universal_python3_wheel) if py3_info["requires_dist"]: if not info["requires_dist"]: info["requires_dist"] = py3_info["requires_dist"] return info py2_requires_dist = set( dependency_from_pep_508(r).to_pep_508() for r in info["requires_dist"] ) py3_requires_dist = set( dependency_from_pep_508(r).to_pep_508() for r in py3_info["requires_dist"] ) base_requires_dist = py2_requires_dist & py3_requires_dist py2_only_requires_dist = py2_requires_dist - py3_requires_dist py3_only_requires_dist = py3_requires_dist - py2_requires_dist # Normalizing requires_dist requires_dist = list(base_requires_dist) for requirement in py2_only_requires_dist: dep = dependency_from_pep_508(requirement) dep.marker = dep.marker.intersect( parse_marker("python_version == '2.7'") ) requires_dist.append(dep.to_pep_508()) for requirement in py3_only_requires_dist: dep = dependency_from_pep_508(requirement) dep.marker = dep.marker.intersect( parse_marker("python_version >= '3'") ) requires_dist.append(dep.to_pep_508()) info["requires_dist"] = sorted(list(set(requires_dist))) if info: return info # Prefer non platform specific wheels if universal_python3_wheel: return self._get_info_from_wheel(universal_python3_wheel) if universal_python2_wheel: return self._get_info_from_wheel(universal_python2_wheel) if platform_specific_wheels and "sdist" not in urls: # Pick the first wheel available and hope for the best return self._get_info_from_wheel(platform_specific_wheels[0]) return self._get_info_from_sdist(urls["sdist"][0]) def _get_info_from_wheel( self, url ): # type: (str) -> Dict[str, Union[str, List, None]] self._log( "Downloading wheel: {}".format(urlparse.urlparse(url).path.rsplit("/")[-1]), level="debug", ) filename = os.path.basename(urlparse.urlparse(url).path.rsplit("/")[-1]) with temporary_directory() as temp_dir: filepath = Path(temp_dir) / filename self._download(url, str(filepath)) return self._inspector.inspect_wheel(filepath) def _get_info_from_sdist( self, url ): # type: (str) -> Dict[str, Union[str, List, None]] self._log( "Downloading sdist: {}".format(urlparse.urlparse(url).path.rsplit("/")[-1]), level="debug", ) filename = os.path.basename(urlparse.urlparse(url).path) with temporary_directory() as temp_dir: filepath = 
Path(temp_dir) / filename self._download(url, str(filepath)) return self._inspector.inspect_sdist(filepath) def _download(self, url, dest): # type: (str, str) -> None r = get(url, stream=True) r.raise_for_status() with open(dest, "wb") as f: for chunk in r.iter_content(chunk_size=1024): if chunk: f.write(chunk) def _log(self, msg, level="info"): getattr(logger, level)("<comment>{}:</comment> {}".format(self._name, msg))
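A compact sketch of the recovery pattern used in _get() above: keep a reference to the FileCache so a poisoned entry (for example one that triggers a redirect loop) can be deleted by its URL key and the request retried once against the network. The package URL and cache path are illustrative.

import requests
from requests.exceptions import TooManyRedirects
from cachecontrol import CacheControl
from cachecontrol.caches.file_cache import FileCache

http_cache = FileCache('pypi_http_cache')
session = CacheControl(requests.Session(), cache=http_cache)

url = 'https://pypi.org/pypi/requests/json'
try:
    response = session.get(url)
except TooManyRedirects:
    # drop the cached entry (the cache key is the URL) and retry
    http_cache.delete(url)
    response = session.get(url)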
def main(): # create dirs root_dir = Path(__file__).resolve().parents[1] dump_dir = root_dir / 'dump' mkdirs(dump_dir) # determine search_urls (should be roughly 0.9B words in total) search_urls = [ f'https://www.smashwords.com/books/category/1/downloads/0/free/medium/{i}' for i in range(0, 30000 + 1, 20) ] # get headers (user-agents) headers = get_headers(root_dir / 'user-agents.txt') # initialize cache-controlled session session = CacheControl(Session()) with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor: # get/write book_page_urls book_page_urls = [] with open(dump_dir / 'book_page_urls.txt', 'w') as f: for nb_retry in count(1): # break if all search_urls successful if not search_urls: break # break if max number of retries exceeded if nb_retry > NB_RETRIES: print( f'Could not get {len(search_urls)} search pages after {NB_RETRIES} retries.' ) break # maintain a list of failed searches (for future retries) failed_search_urls = [] # get the search_responses search_responses = list( tqdm(executor.map(get, search_urls, repeat(session), cycle(headers)), total=len(search_urls), desc='Getting searches')) # dump the search_responses dump(search_responses, 'search_responses.pkl') for search_url, search_r in zip(search_urls, search_responses): if search_r is not None: if search_r.status_code == 200: search_r.encoding = 'utf-8' search_tree = html.fromstring(search_r.content) search_tree.make_links_absolute(search_r.url) try: for book_page_url in search_tree.xpath( '//a[@class="library-title"]/@href'): book_page_urls.append(book_page_url) f.write(book_page_url + '\n') except IndexError: failed_search_urls.append(search_url) print(f'Request failed for {search_url}') else: failed_search_urls.append(search_url) print( f'Request failed for {search_url}: status code [{search_r.status_code}]' ) search_urls = failed_search_urls # write book_download_urls.txt with open(root_dir / 'book_download_urls.txt', 'w') as f: for nb_retry in count(1): # break if all book_page_urls successful if not book_page_urls: break # break if max number of retries exceeded if nb_retry > NB_RETRIES: print( f'Could not get {len(book_page_urls)} book pages after {NB_RETRIES} retries.' ) break # maintain a list of failed book_pagees (for future retries) failed_book_page_urls = [] # get the book_page_responses book_page_responses = list( tqdm(executor.map(get, book_page_urls, repeat(session), cycle(headers)), total=len(book_page_urls), desc='Getting book pages')) # dump the book_page_responses dump(book_page_responses, 'book_page_responses.pkl') for book_page_url, book_page_r in zip(book_page_urls, book_page_responses): if book_page_r is not None: if book_page_r.status_code == 200: book_page_r.encoding = 'utf-8' book_page_tree = html.fromstring( book_page_r.content) try: # get relevant data script_text = book_page_tree.xpath( '//div[@id="contentArea"]/script/text()' )[0] _json = json.loads( script_text.split( 'window.angularData.book = ')[1].split( '};')[0] + '}') try: language = _json['language']['name'] if language == 'English': formats = _json['formats'] if 'TXT' in formats: f.write( book_page_tree.xpath( '//a[@title="Plain text; contains no formatting"]/@href' )[0] + '\n') else: continue except KeyError: continue except IndexError: failed_book_page_urls.append(book_page_url) print(f'Request failed for {book_page_url}') else: failed_book_page_urls.append(book_page_url) print( f'Request failed for {book_page_url}: status code [{book_page_r.status_code}]' ) book_page_urls = failed_book_page_urls
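The scraper above maps a get() helper over the URL lists together with the shared CacheControl session, but the helper itself is not shown. The following is a hypothetical sketch of what it might look like, on the assumptions that each headers element is a dict such as {'User-Agent': ...} and that connection errors should be swallowed so the retry loops can collect the failed URLs; the timeout value is also an assumption.

import requests


def get(url, session, headers, timeout=30):
    """Hypothetical fetch helper: return a Response, or None on a request error."""
    try:
        return session.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return None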