Example #1
    def add_cookie_header(self, request):
        wreq = WrappedRequest(request)
        self.policy._now = self.jar._now = int(time.time())

        # the cookiejar implementation iterates through all domains
        # instead we restrict to potential matches on the domain
        req_host = urlparse_cached(request).hostname
        if not req_host:
            return

        if not IPV4_RE.search(req_host):
            hosts = potential_domain_matches(req_host)
            if "." not in req_host:
                hosts += [req_host + ".local"]
        else:
            hosts = [req_host]

        cookies = []
        for host in hosts:
            if host in self.jar._cookies:
                cookies += self.jar._cookies_for_domain(host, wreq)

        attrs = self.jar._cookie_attrs(cookies)
        if attrs:
            if not wreq.has_header("Cookie"):
                wreq.add_unredirected_header("Cookie", "; ".join(attrs))

        self.processed += 1
        if self.processed % self.check_expired_frequency == 0:
            # This is still quite inefficient for a large number of cookies
            self.jar.clear_expired_cookies()
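For context, potential_domain_matches() expands a hostname into every domain a matching cookie could have been set for, which is what lets the loop above skip unrelated domains in the jar. A minimal sketch of the idea (not the exact Scrapy implementation):

def potential_domain_matches_sketch(domain):
    # 'www.example.com' -> ['www.example.com', 'example.com',
    #                       '.www.example.com', '.example.com']
    matches = [domain]
    parts = domain.split(".")
    for i in range(1, len(parts) - 1):
        matches.append(".".join(parts[i:]))
    return matches + ["." + m for m in matches]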
Example #2
 def _set_connection_attributes(self, request):
     parsed = urlparse_cached(request)
     self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
     proxy = request.meta.get('proxy')
     if proxy:
         self.scheme, _, self.host, self.port, _ = _parse(proxy)
         self.path = self.url
Example #3
 def should_follow(self, request, spider):
     if not hasattr(self, 'host_regex'):
         self.spider_opened(spider)
     regex = self.host_regex
     # hostname can be None for malformed URLs (such as javascript: links)
     host = urlparse_cached(request).hostname or ''
     return bool(regex.search(host))
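In Scrapy, host_regex is derived from the spider's allowed_domains; a sketch of how such a pattern can be built (helper name hypothetical):

import re

def get_host_regex_sketch(allowed_domains):
    # An empty list disables filtering; otherwise match the domain
    # itself or any subdomain of it.
    if not allowed_domains:
        return re.compile("")
    pattern = r"^(.*\.)?(%s)$" % "|".join(re.escape(d) for d in allowed_domains)
    return re.compile(pattern)

get_host_regex_sketch(["example.com"]).search("www.example.com")  # matches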
Example #4
 def should_cache_request(self, request):
     if urlparse_cached(request).scheme in self.ignore_schemes:
         return False
     cc = self._parse_cachecontrol(request)
     # obey user-agent directive "Cache-Control: no-store"
     if 'no-store' in cc:
         return False
     # Anything else is eligible for caching
     return True
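The _parse_cachecontrol() call returns the request's Cache-Control directives as a dict, so the 'no-store' membership test above works on directive names. A minimal sketch of such a parser (hypothetical helper, not the exact Scrapy code):

def parse_cachecontrol_sketch(header):
    # 'no-store, max-age=300' -> {'no-store': None, 'max-age': '300'}
    directives = {}
    for directive in header.split(","):
        key, sep, value = directive.strip().partition("=")
        if key:
            directives[key.lower()] = value if sep else None
    return directives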
Example #5
    def _get_slot_key(self, request, spider):
        if 'download_slot' in request.meta:
            return request.meta['download_slot']

        key = urlparse_cached(request).hostname or ''
        if self.ip_concurrency:
            key = dnscache.get(key, key)

        return key
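The dnscache.get(key, key) lookup falls back to the hostname itself, so hosts without a cached DNS entry still produce a stable slot key. A toy illustration with a plain dict standing in for dnscache (values hypothetical):

dnscache = {"example.com": "93.184.216.34"}
dnscache.get("example.com", "example.com")  # -> '93.184.216.34' (one slot per IP)
dnscache.get("other.test", "other.test")    # -> 'other.test' (fallback to hostname)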
Example #6
 def download_request(self, request, spider):
     scheme = urlparse_cached(request).scheme
     try:
         handler = self._handlers[scheme].download_request
     except KeyError:
         msg = self._notconfigured.get(scheme,
                                       'no handler available for that scheme')
         raise NotSupported("Unsupported URL scheme '%s': %s" %
                            (scheme, msg))
     return handler(request, spider)
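The lookup above assumes self._handlers maps URL schemes to handler objects and self._notconfigured records why a scheme is unavailable. A self-contained sketch of the same dispatch pattern (handlers are hypothetical stand-ins):

handlers = {"http": lambda request: "downloaded via HTTP"}
notconfigured = {"s3": "boto is not installed"}

def dispatch(scheme, request):
    try:
        handler = handlers[scheme]
    except KeyError:
        msg = notconfigured.get(scheme, "no handler available for that scheme")
        raise Exception("Unsupported URL scheme '%s': %s" % (scheme, msg))
    return handler(request)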
Example #7
 def robot_parser(self, request, spider):
     url = urlparse_cached(request)
     netloc = url.netloc
     if netloc not in self._parsers:
         self._parsers[netloc] = None
         robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
         robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
         dfd = self.crawler.engine.download(robotsreq, spider)
         dfd.addCallback(self._parse_robots)
         self._spider_netlocs.add(netloc)
     return self._parsers[netloc]
Example #8
 def download_request(self, request, spider):
     p = urlparse_cached(request)
     scheme = 'https' if request.meta.get('is_secure') else 'http'
     bucket = p.hostname
     path = p.path + '?' + p.query if p.query else p.path
     url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
     signed_headers = self.conn.make_request(
             method=request.method,
             bucket=bucket,
             key=p.path,
             query_args=p.query,
             headers=request.headers,
             data=request.body)
     httpreq = request.replace(url=url, headers=signed_headers)
     return self._download_http(httpreq, spider)
Example #9
def request_httprepr(request):
    """Return the raw HTTP representation (as string) of the given request.
    This is provided only for reference since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    s = "%s %s HTTP/1.1\r\n" % (request.method, path)
    s += "Host: %s\r\n" % parsed.hostname
    if request.headers:
        s += request.headers.to_string() + "\r\n"
    s += "\r\n"
    s += request.body
    return s
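A quick usage example (assuming a standard scrapy.http.Request; in this Python 2-era code the result is a plain str):

from scrapy.http import Request

req = Request("http://example.com/page?x=1")
request_httprepr(req)
# 'GET /page?x=1 HTTP/1.1\r\nHost: example.com\r\n\r\n'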
Example #10
 def process_spider_output(self, response, result, spider):
     for x in result:
         if isinstance(x, Request):
             if x.dont_filter or self.should_follow(x, spider):
                 yield x
             else:
                 domain = urlparse_cached(x).hostname
                 if domain and domain not in self.domains_seen:
                     self.domains_seen.add(domain)
                     log.spider_log("Filtered offsite request to %s: %s"
                                    % (domain, x.url),
                                    level=log.DEBUG, spider=spider)
                     self.stats.inc_value('offsite/domains', spider=spider)
                 self.stats.inc_value('offsite/filtered', spider=spider)
         else:
             yield x
Example #11
 def should_cache_request(self, request):
     return urlparse_cached(request).scheme not in self.ignore_schemes
Example #12
 def get_origin_req_host(self):
     return urlparse_cached(self.request).hostname
Example #13
 def get_type(self):
     return urlparse_cached(self.request).scheme
Example #14
 def get_host(self):
     return urlparse_cached(self.request).netloc
Example #15
 def _parse_robots(self, response):
     rp = robotparser.RobotFileParser(response.url)
     rp.parse(response.body.splitlines())
     self._parsers[urlparse_cached(response).netloc] = rp
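Once stored, each parser answers can_fetch() queries through the standard library's robotparser API, for example:

import robotparser  # Python 2 stdlib module, as used above

rp = robotparser.RobotFileParser()
rp.parse("User-agent: *\nDisallow: /private/".splitlines())
rp.can_fetch("mybot", "http://example.com/private/page")  # -> False
rp.can_fetch("mybot", "http://example.com/index.html")    # -> True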