Example #1
 def session(self):
     if self._session is None:
         self._session = real_requests.Session()
         if CacheControlAdapter:
             adapter = CacheControlAdapter(cache=FileCache(".webcache"))
             self._session.mount("http://", adapter)
             self._session.mount("https://", adapter)
             print("Caching to .webcache")
     return self._session
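For reference, the same mount pattern can be written as a small standalone helper; this is a minimal sketch assuming only requests and cachecontrol are installed (the function name is illustrative):

import requests
from cachecontrol import CacheControlAdapter
from cachecontrol.caches import FileCache

def make_cached_session(cache_dir=".webcache"):
    # Mount one caching adapter for both HTTP and HTTPS traffic.
    session = requests.Session()
    adapter = CacheControlAdapter(cache=FileCache(cache_dir))
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session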
Example #2
    def get_reader(self):
        sess = CacheControl(requests.Session(),
                            cache=FileCache(tempfile.gettempdir()))
        req = sess.get(self.file)

        # if the response is not 200, an exception will be raised
        req.raise_for_status()

        return io.BufferedReader(io.BytesIO(req.content))
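The returned BufferedReader behaves like an ordinary binary file object; a usage sketch, assuming obj is an instance of the class that defines get_reader() above:

reader = obj.get_reader()   # download (possibly from the FileCache) and wrap in a reader
header = reader.read(4)     # read the first few bytes of the cached content
reader.seek(0)              # BufferedReader over BytesIO supports seeking back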
Example #3
def get_cached_session(caching=True):
    if not caching:
        return requests.Session()

    # For some reason, CacheControl works quite badly in concurrent
    # environments, so caching is effectively disabled here and the code
    # below is unreachable.
    return requests.Session()

    CACHE_DIR = 'web_cache'
    return CacheControl(requests.Session(),
                        cache=FileCache(CACHE_DIR),
                        heuristic=_LastModifiedNoDate(require_date=False))
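_LastModifiedNoDate looks like a project-specific heuristic; for comparison, a roughly equivalent cached session can be built with cachecontrol's stock LastModified heuristic (a sketch, not this project's exact behaviour):

import requests
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache
from cachecontrol.heuristics import LastModified

def build_cached_session(cache_dir='web_cache'):
    # Cache responses on disk and derive freshness from Last-Modified headers.
    return CacheControl(requests.Session(),
                        cache=FileCache(cache_dir),
                        heuristic=LastModified())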
Example #4
    def __init__(self, headers=None, cookies=None, cache_name=None, delay=1, expire_hours=12, as_string=False):
        '''
        Base class for common scraping tasks
        Args:
            headers: dict of headers
            cookies: cookiejar object
            cache_name: should be full path
            delay: int (be polite!!!)
            expire_hours: int - default 12
            as_string: get string rather than parsed json
        '''
        logging.getLogger(__name__).addHandler(logging.NullHandler())

        if not cookies:
            try:
                import cookielib
                cookies = cookielib.MozillaCookieJar()
            except (NameError, ImportError) as e:
                try:
                    import http.cookiejar
                    cookies = http.cookiejar.MozillaCookieJar()
                except Exception as e:
                    pass

        _s = requests.Session()
        _s.cookies = cookies

        if headers:
            _s.headers.update(headers)
        else:
            _s.headers.update({'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'})

        if cache_name:
            if '/' not in cache_name:
                cache_name = os.path.join('/tmp', cache_name)
            try:
                from cachecontrol import CacheControlAdapter
                from cachecontrol.heuristics import ExpiresAfter
                from cachecontrol.caches import FileCache
                _s.mount('http://', CacheControlAdapter(cache=FileCache(cache_name),
                                                        cache_etags=False,
                                                        heuristic=ExpiresAfter(hours=expire_hours)))
            except ImportError as e:
                try:
                    import requests_cache
                    requests_cache.install_cache(cache_name)
                except:
                    pass

        self.s = _s
        self.urls = []
        self.as_string = as_string

        if delay > 0:
            self.delay = delay
        else:
            self.delay = None
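A hypothetical use of this base class might look like the following; the Scraper name, the subclass, and the URL are assumptions for illustration:

class TeamScraper(Scraper):  # hypothetical subclass of the base class above
    def team_page(self, team_id):
        # self.s is the requests session built in __init__, with on-disk caching
        # (cachecontrol or requests_cache) when cache_name was given.
        return self.s.get('https://example.com/teams/{}'.format(team_id)).text

scraper = TeamScraper(cache_name='/tmp/team_cache', delay=2, expire_hours=6)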
Example #5
 def __init__(self, destination, staging, s3_url, dry_run, cache):
     self.destination = destination
     self.staging = staging
     self.s3_url = s3_url
     self.dry_run = dry_run
     if cache:
         self.info(f"Using cache {cache}")
         self.fetcher = CacheControl(requests.session(), cache=FileCache(cache))
     else:
         self.info(f"Making uncached requests")
         self.fetcher = requests
Example #6
def amalgama_lyrics(artist, song):
    url = amalgama.get_url(artist, song)
    try:
        cached_sess = CacheControl(sess, cache=FileCache('.amalgama'))
        response = cached_sess.get(url)
        response.raise_for_status()
    except requests.exceptions.HTTPError:
        print(f'{artist}-{song} not found in amalgama {url}')
        return None
    text = amalgama.get_html(response.text)
    return text
Example #7
    def __init__(self,
                 ctx,
                 schemagraph=None,
                 foreign_properties=None,
                 idx=None,
                 cache=None,
                 session=None):
        # type: (Loader.ContextType, rdflib.Graph, Set[unicode], Dict[unicode, Union[List, Dict[unicode, Any], unicode]], Dict[unicode, Any], requests.sessions.Session) -> None
        normalize = lambda url: urlparse.urlsplit(url).geturl()
        if idx is not None:
            self.idx = idx
        else:
            self.idx = NormDict(normalize)

        self.ctx = {}  # type: Loader.ContextType
        if schemagraph is not None:
            self.graph = schemagraph
        else:
            self.graph = rdflib.graph.Graph()

        if foreign_properties is not None:
            self.foreign_properties = foreign_properties
        else:
            self.foreign_properties = set()

        if cache is not None:
            self.cache = cache
        else:
            self.cache = {}

        self.session = None  # type: requests.sessions.Session
        if session is not None:
            self.session = session
        else:
            self.session = CacheControl(requests.Session(),
                                        cache=FileCache(
                                            os.path.join(
                                                os.environ["HOME"], ".cache",
                                                "salad")))

        self.url_fields = None  # type: Set[unicode]
        self.scoped_ref_fields = None  # type: Dict[unicode, int]
        self.vocab_fields = None  # type: Set[unicode]
        self.identifiers = None  # type: Set[unicode]
        self.identity_links = None  # type: Set[unicode]
        self.standalone = None  # type: Set[unicode]
        self.nolinkcheck = None  # type: Set[unicode]
        self.vocab = {}  # type: Dict[unicode, unicode]
        self.rvocab = {}  # type: Dict[unicode, unicode]
        self.idmap = None  # type: Dict[unicode, Any]
        self.mapPredicate = None  # type: Dict[unicode, unicode]
        self.type_dsl_fields = None  # type: Set[unicode]

        self.add_context(ctx)
Example #8
 def __init__(self, destination: Path, staging: Path, s3_url: str, dry_run: bool, is_nightly_enabled: bool,
              cache: Optional[Path]):
     self.destination = destination
     self.staging = staging
     self.s3_url = s3_url
     self.dry_run = dry_run
     self.is_nightly_enabled = is_nightly_enabled
     if cache:
         self.info(f"Using cache {cache}")
         self.fetcher = CacheControl(requests.session(), cache=FileCache(cache))
     else:
         self.info("Making uncached requests")
         self.fetcher = requests
Example #9
def get_public_key(token):
    """
    Because Google's public keys are only changed infrequently (on the order of once per day),
    we can take advantage of caching to reduce latency and the potential for network errors.
    """
    sess = CacheControl(requests.Session(),
                        cache=FileCache('/tmp/firebase-certs-cache'))
    request = sess.get(_CERT_URL)
    ks = request.json()
    keys = []
    for k, v in ks.items():
        keys.append({"alg": "RS256", "kid": k, "pem": v})
    return search_for_key(token, keys)
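cachecontrol marks responses it answered from cache with a from_cache flag, so the caching effect can be checked directly; a minimal sketch, assuming _CERT_URL is the certificate endpoint used above:

import requests
from cachecontrol import CacheControl
from cachecontrol.caches import FileCache

sess = CacheControl(requests.Session(),
                    cache=FileCache('/tmp/firebase-certs-cache'))
first = sess.get(_CERT_URL)    # fetched over the network (or reused from an earlier run)
second = sess.get(_CERT_URL)   # typically served from the FileCache while max-age holds
print(getattr(second, 'from_cache', False))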
Example #10
    def fetch(self):
        feed = None
        if InformantConfig().get_argv_clear_cache():
            ui.debug_print('Clearing cache')
            fs.clear_cachefile()
        if InformantConfig().get_argv_use_cache():
            ui.debug_print('Checking cache in {}'.format(
                InformantConfig().get_cachefile()))
            cachefile = InformantConfig().get_cachefile()
            os.umask(
                0o0002
            )  # unrestrict umask so we can cache with proper permissions
            try:
                session = CacheControl(requests.Session(),
                                       cache=FileCache(cachefile,
                                                       filemode=0o0664,
                                                       dirmode=0o0775))
                feed = feedparser.parse(session.get(self.url).content)
            except Exception as e:
                ui.err_print('Unable to read cache information: {}'.format(e))
                ui.debug_print('Falling back to fetching feed')
                feed = feedparser.parse(self.url)
        else:
            feed = feedparser.parse(self.url)

        if feed.bozo:
            e = feed.bozo_exception
            if isinstance(e, URLError):
                # most likely this is an internet issue (no connection)
                ui.warn_print('News could not be fetched for {}'.format(
                    self.name if self.name is not None else self.url))
                ui.debug_print('URLError: {}'.format(e.reason))
            else:
                # I think this is most likely to be a malformed feed
                ui.err_print('Encountered feed error: {}'.format(
                    feed.bozo_exception))
                ui.debug_print('bozo message: {}'.format(
                    feed.bozo_exception.getMessage()))
            # In either of these error cases we probably shouldn't return error
            # so the pacman hook won't hold up an operation.
            # Here return an empty set of entries in case only one of multiple
            # feeds failed to fetch
            try:
                feed = feedparser.util.FeedParserDict()
                feed.update({'entries': []})
            except Exception as e:
                ui.err_print('Unexpected error: {}'.format(e))
                sys.exit()

        return feed
Example #11
def main(argv):
    sess = CacheControl(requests.Session(),
                        cache=FileCache('.web_cache'))
    requests.get = sess.get
    resource_schema = tools.load(sys.argv[1])

    apply_all_tweaks(resource_schema)

    if len(argv) == 3 and argv[2].endswith('json'):
        tools.write(resource_schema, argv[1])
    else:
        print(tools.print_(resource_schema))

    return 0
Example #12
    def __init__(
        self,
        fetcher: Optional[Fetcher] = None,
        namespaces: Optional[Dict[str, str]] = None,
        schemas: Optional[Dict[str, str]] = None,
        fileuri: Optional[str] = None,
        copyfrom: Optional["LoadingOptions"] = None,
        original_doc: Optional[Any] = None,
    ) -> None:
        """Create a LoadingOptions object."""
        self.idx: Dict[str, Dict[str, Any]] = {}
        self.fileuri: Optional[str] = fileuri
        self.namespaces = namespaces
        self.schemas = schemas
        self.original_doc = original_doc
        if copyfrom is not None:
            self.idx = copyfrom.idx
            if fetcher is None:
                fetcher = copyfrom.fetcher
            if fileuri is None:
                self.fileuri = copyfrom.fileuri
            if namespaces is None:
                self.namespaces = copyfrom.namespaces
            if schemas is None:
                self.schemas = copyfrom.schemas

        if fetcher is None:
            import requests
            from cachecontrol.caches import FileCache
            from cachecontrol.wrapper import CacheControl

            root = pathlib.Path(os.environ.get("HOME", tempfile.gettempdir()))
            session = CacheControl(
                requests.Session(),
                cache=FileCache(root / ".cache" / "salad"),
            )
            self.fetcher: Fetcher = DefaultFetcher({}, session)
        else:
            self.fetcher = fetcher

        self.vocab = _vocab
        self.rvocab = _rvocab

        if namespaces is not None:
            self.vocab = self.vocab.copy()
            self.rvocab = self.rvocab.copy()
            for k, v in namespaces.items():
                self.vocab[k] = v
                self.rvocab[v] = k
Example #13
def requests_session() -> requests.Session:
    """Creates a Requests-Cache session object."""

    global _session

    if _session is not None:
        return _session

    cache_name = cache_directory('blender_cloud_http')
    log.info('Storing cache in %s' % cache_name)

    _session = cachecontrol.CacheControl(sess=requests.session(),
                                         cache=FileCache(cache_name))

    return _session
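Repeated calls hand back the same module-level session, so every caller shares one on-disk cache; a usage sketch with an illustrative endpoint:

sess = requests_session()
resp = sess.get('https://cloud.blender.org/api/status')  # illustrative URL
assert requests_session() is sess  # the module-level session is reused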
Example #14
def get_cached_session():
    sess = CacheControl(requests.Session(),
                        cache=FileCache(CACHE_DIR),
                        heuristic=LastModifiedNoDate(require_date=False))

    original_get = sess.get

    def wrapped_get(*args, **kwargs):
        try:
            return original_get(*args, **kwargs)
        except (OSError, IOError) as e:
            return requests.get(*args, **kwargs)

    sess.get = wrapped_get
    return sess
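The wrapper means that a failure to read or write the on-disk cache (for example an unwritable CACHE_DIR) degrades to a plain uncached requests.get instead of raising; usage is unchanged from a normal session (the URL is illustrative):

sess = get_cached_session()
resp = sess.get('https://example.com/data.json')  # cached when the FileCache is usable
resp.raise_for_status()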
Example #15
    def __init__(
            self,
            fetcher=None,  # type: Optional[Fetcher]
            namespaces=None,  # type: Optional[Dict[str, str]]
            schemas=None,  # type: Optional[Dict[str, str]]
            fileuri=None,  # type: Optional[str]
            copyfrom=None,  # type: Optional[LoadingOptions]
            original_doc=None,  # type: Optional[Any]
    ):  # type: (...) -> None
        self.idx = {}  # type: Dict[str, Dict[str, Any]]
        self.fileuri = fileuri  # type: Optional[str]
        self.namespaces = namespaces
        self.schemas = schemas
        self.original_doc = original_doc
        if copyfrom is not None:
            self.idx = copyfrom.idx
            if fetcher is None:
                fetcher = copyfrom.fetcher
            if fileuri is None:
                self.fileuri = copyfrom.fileuri
            if namespaces is None:
                self.namespaces = copyfrom.namespaces
            if schemas is None:
                self.schemas = copyfrom.schemas

        if fetcher is None:
            import requests
            from cachecontrol.caches import FileCache
            from cachecontrol.wrapper import CacheControl

            root = pathlib.Path(os.environ.get("HOME", tempfile.gettempdir()))
            session = CacheControl(
                requests.Session(),
                cache=FileCache(root / ".cache" / "salad"),
            )
            self.fetcher: Fetcher = DefaultFetcher({}, session)
        else:
            self.fetcher = fetcher

        self.vocab = _vocab
        self.rvocab = _rvocab

        if namespaces is not None:
            self.vocab = self.vocab.copy()
            self.rvocab = self.rvocab.copy()
            for k, v in namespaces.items():
                self.vocab[k] = v
                self.rvocab[v] = k
Example #16
def make_session(scraper):
    """ Instantiate a session with the desired configuration parameters,
    including the cache policy. """
    cache_path = os.path.join(scraper.config.data_path, 'cache')
    cache_policy = scraper.config.cache_policy
    cache_policy = cache_policy.lower().strip()
    session = ScraperSession()
    session.scraper = scraper
    session.cache_policy = cache_policy

    adapter = CacheControlAdapter(FileCache(cache_path),
                                  cache_etags=True,
                                  controller_class=PolicyCacheController)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session
Example #17
 def __init__(
     self,
     config: Config | None = None,
     io: IO | None = None,
     cache_id: str | None = None,
     disable_cache: bool = False,
 ) -> None:
     self._config = config or Config.create()
     self._io = io
     self._sessions_for_netloc: dict[str, requests.Session] = {}
     self._credentials: dict[str, HTTPAuthCredential] = {}
     self._certs: dict[str, dict[str, Path | None]] = {}
     self._configured_repositories: dict[
         str, AuthenticatorRepositoryConfig] | None = None
     self._password_manager = PasswordManager(self._config)
     self._cache_control = (FileCache(
         str(REPOSITORY_CACHE_DIR / (cache_id or "_default_cache") /
             "_http")) if not disable_cache else None)
Example #18
    def fromParameters(
            cls, sessionFactory: Callable[[],
                                          requests.Session], cachePath: str,
            maxAgeDictionary: Mapping[str, int]) -> 'IntersphinxCache':
        """
        Construct an instance with the given parameters.

        @param sessionFactory: A zero-argument L{callable} that
            returns a L{requests.Session}.
        @param cachePath: Path of the cache directory.
        @param maxAgeDictionary: A mapping describing the maximum
            age of any cache entry.
        @see: L{parseMaxAge}
        """
        session = CacheControl(sessionFactory(),
                               cache=FileCache(cachePath),
                               heuristic=ExpiresAfter(**maxAgeDictionary))
        return cls(session)
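A hypothetical call might pass requests.Session as the zero-argument factory and a max-age mapping that ExpiresAfter accepts as keyword arguments (the path and ages are illustrative):

import requests

cache = IntersphinxCache.fromParameters(
    requests.Session,           # zero-argument session factory
    '/tmp/intersphinx-cache',   # on-disk cache directory
    {'hours': 12},              # forwarded as ExpiresAfter(hours=12)
)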
Example #19
def _get_filehandle(filepath_or, *args, **kwargs):
    """Open file if `filepath_or` looks like a string/unicode/bytes, else
    pass through.
    """
    if _is_string_or_bytes(filepath_or):
        if requests.compat.urlparse(filepath_or).scheme in {'http', 'https'}:
            sess = CacheControl(requests.Session(),
                                cache=FileCache(gettempdir()))
            req = sess.get(filepath_or, **kwargs)

            # if the response is not 200, an exception will be raised
            req.raise_for_status()

            fh, own_fh = BytesIO(req.content), True
        else:
            fh, own_fh = open(filepath_or, *args, **kwargs), True
    else:
        fh, own_fh = filepath_or, False
    return fh, own_fh
Example #20
    def __init__(self, engine, **kwargs):
        self.id = hashlib.sha1(
            six.b('{0}:{1!r}'.format(clsname(self),
                                     kwargs.get('__init_args__',
                                                {})))).hexdigest()

        if engine:
            self.engine = engine
        else:
            self.engine = busbus.Engine()
        self.engine._register_provider(self)

        self._requests = requests.Session()

        # This requests session object, wrapped with CacheControl, is useful
        # for long-term storage of larger files, such as GTFS data.
        self._cached_requests = CacheControl(
            self._requests,
            cache=FileCache(self.engine.config['url_cache_dir']))
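A hedged usage sketch: the CacheControl-wrapped session is intended for large, rarely changing downloads such as GTFS feeds (the provider instance and URL are illustrative):

resp = provider._cached_requests.get('https://transit.example.org/gtfs.zip')
with open('gtfs.zip', 'wb') as fh:
    fh.write(resp.content)  # served from url_cache_dir on repeat runs while still fresh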
Example #21
def get_feed(feed_url):
    """
    Return feed parsed feed
    """
    requests_timeout = getattr(settings, 'FEED_TIMOUT', 1)

    cache_adapter = CacheControlAdapter(
        cache=FileCache('.web_cache'),
        heuristic=ExpiresAfter(hours=1),
    )

    session = requests.Session()
    session.mount('http://', cache_adapter)
    session.mount('https://', cache_adapter)

    show_exceptions = getattr(settings, 'DEBUG', True)

    feed_request = session.get(feed_url, timeout=requests_timeout)

    return feedparser.parse(feed_request.text)
Example #22
    def __init__(self, api_key=None, locale=None, anonymize=False,
                 exclude_episodes=False, user_agent=None, cache=None,
                 proxy_uri=None, verify_ssl=True, session=None):
        self.api_key = api_key or SHA1_KEY
        self.timestamp = time.mktime(datetime.date.today().timetuple())
        self.user_agent = user_agent or random.choice(USER_AGENTS)
        self.locale = locale or 'en_US'
        self.exclude_episodes = exclude_episodes
        self.caching_enabled = True if cache is True else False
        self.proxy_uri = proxy_uri or DEFAULT_PROXY_URI
        self.anonymize = anonymize
        self.verify_ssl = verify_ssl
        self.session = session or requests.Session()

        if self.caching_enabled:
            warnings.warn('caching will be removed in version 5.0.0 '
                          'due to not being thread safe')
            self.session = CacheControl(
                self.session, cache=FileCache('.imdbpie_cache')
            )
Example #23
    def fromParameters(cls, sessionFactory, cachePath, maxAgeDictionary):
        """
        Construct an instance with the given parameters.

        @param sessionFactory: A zero-argument L{callable} that
            returns a L{requests.Session}.

        @param cachePath: Path of the cache directory.
        @type cachePath: L{str}

        @param maxAgeDictionary: A dictionary describing the maximum
            age of any cache entry.
        @type maxAgeDictionary: L{dict}

        @see: L{parseMaxAge}
        """
        session = CacheControl(sessionFactory(),
                               cache=FileCache(cachePath),
                               heuristic=ExpiresAfter(**maxAgeDictionary))
        return cls(session)
Example #24
 def __init__(
     self,
     config: Config | None = None,
     io: IO | None = None,
     cache_id: str | None = None,
     disable_cache: bool = False,
 ) -> None:
     self._config = config or Config.create()
     self._io = io
     self._sessions_for_netloc: dict[str, requests.Session] = {}
     self._credentials: dict[str, HTTPAuthCredential] = {}
     self._certs: dict[str, RepositoryCertificateConfig] = {}
     self._configured_repositories: dict[
         str, AuthenticatorRepositoryConfig] | None = None
     self._password_manager = PasswordManager(self._config)
     self._cache_control = (FileCache(
         str(self._config.repository_cache_directory /
             (cache_id or "_default_cache") /
             "_http")) if not disable_cache else None)
     self.get_repository_config_for_url = functools.lru_cache(maxsize=None)(
         self._get_repository_config_for_url)
Example #25
def get_session():
    CACHE_FOLDER.mkdir(exist_ok=True)
    cache = FileCache(str(CACHE_FOLDER), forever=True)
    cache.set("foo", b"bar")
    assert cache.get("foo") == b"bar"
    session = RateLimitingSession()
    # session.headers.update({"x-api-key": "something-something-darkside"})
    session.mount(
        "https://www.metlink.org.nz/",
        CacheControlAdapter(heuristic=BetterExpiresAfter(days=7), cache=cache),
    )
    session.mount(
        METLINK_API_URL_PREFIX,
        CacheControlAdapter(heuristic=BetterExpiresAfter(days=1), cache=cache),
    )
    session.mount(
        METLINK_API_URL_PREFIX + "ServiceLocation/",
        CacheControlAdapter(heuristic=BetterExpiresAfter(seconds=90),
                            cache=cache),
    )
    return session
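requests keeps mounted adapters ordered by prefix length, so the most specific mount wins: ServiceLocation calls are cached for 90 seconds, other Metlink API calls for a day, and everything else under www.metlink.org.nz for a week. A usage sketch (the stop number is illustrative):

session = get_session()
# Matches the METLINK_API_URL_PREFIX + "ServiceLocation/" mount, so the
# 90-second heuristic applies rather than the broader one-day or one-week ones.
resp = session.get(METLINK_API_URL_PREFIX + "ServiceLocation/5000")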
Example #26
class Settings:
    do_update_wikidata = True

    # Read also tags if a project doesn't use githubs releases
    read_tags = True

    normalize_repo_url = True

    blacklist_page = "User:Github-wiki-bot/Exceptions"
    whitelist_page = "User:Github-wiki-bot/Whitelist"
    blacklist: List[str] = []
    whitelist: List[str] = []
    sparql_file = "free_software_items.rq"

    # pywikibot is too stupid to cache the calendar model, so let's do this manually
    calendarmodel = pywikibot.Site().data_repository().calendarmodel()
    wikidata_repo = pywikibot.Site("wikidata", "wikidata").data_repository()

    repo_regex = re.compile(r"^[a-z]+://github.com/[^/]+/[^/]+/?$")

    cached_session: requests.Session = CacheControl(requests.Session(),
                                                    cache=FileCache("cache"))
Example #27
def downloadHttpFile(httpurl):
    # type: (Text) -> Text
    cache_session = None
    if "XDG_CACHE_HOME" in os.environ:
        directory = os.environ["XDG_CACHE_HOME"]
    elif "HOME" in os.environ:
        directory = os.environ["HOME"]
    else:
        directory = os.path.expanduser('~')

    cache_session = CacheControl(
        requests.Session(),
        cache=FileCache(
            os.path.join(directory, ".cache", "cwltool")))

    r = cache_session.get(httpurl, stream=True)
    with NamedTemporaryFile(mode='wb', delete=False) as f:
        for chunk in r.iter_content(chunk_size=16384):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
    r.close()
    return f.name
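Usage is a single call that returns the path of the temporary file the (possibly cached) response was streamed into; the URL below is illustrative:

path = downloadHttpFile('https://example.com/workflow.cwl')
with open(path) as handle:
    print(handle.read())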
Example #28
    def __init__(self,
                 redis_connection=None,
                 fallback_cache_duration=5,
                 file_cache_directory=".webcache",
                 timeout=(0.5, 3),
                 *args,
                 **kwargs):
        super(CachedSession, self).__init__(*args, **kwargs)

        heuristic = ExpiresAfterIfNoCacheControl(
            seconds=fallback_cache_duration)
        cache = FileCache(file_cache_directory)

        if redis_connection:
            cache = RedisCache(redis_connection)

        adapter = CacheAdapterWithTimeout(heuristic=heuristic,
                                          cache=cache,
                                          timeout=timeout)

        self.mount("http://", adapter)
        self.mount("https://", adapter)
Example #29
    def open(self):
        global SESSION
        if SESSION is None:
            SESSION = CacheControl(Session(),
                                   cache=FileCache(SESSION_CACHE_PATH))

        try:
            self._response = SESSION.get(self.uri, headers=self.headers)
        except ConnectionError as e:
            raise LoaderException('request connection error: "%s"' % self.uri,
                                  cause=e)
        except Exception as e:
            raise LoaderException('request error: "%s"' % self.uri, cause=e)

        status = self._response.status_code
        if status == 404:
            self._response = None
            raise DocumentNotFoundException('document not found: "%s"' %
                                            self.uri)
        elif status != 200:
            self._response = None
            raise LoaderException('request error %d: "%s"' %
                                  (status, self.uri))
Example #30
    def __init__(self,
                 api_key=None,
                 locale=None,
                 anonymize=None,
                 exclude_episodes=None,
                 user_agent=None,
                 cache=None,
                 proxy_uri=None,
                 verify_ssl=None):
        self.api_key = api_key or SHA1_KEY
        self.timestamp = time.mktime(datetime.date.today().timetuple())
        self.user_agent = user_agent or random.choice(USER_AGENTS)
        self.locale = locale or 'en_US'
        self.exclude_episodes = True if exclude_episodes is True else False
        self.caching_enabled = True if cache is True else False
        self.proxy_uri = proxy_uri or DEFAULT_PROXY_URI
        self.anonymize = anonymize or False
        self.verify_ssl = True if verify_ssl is None else verify_ssl
        self.session = requests

        if self.caching_enabled:
            self.session = CacheControl(requests.Session(),
                                        cache=FileCache('.imdbpie_cache'))