def add_cookie_header(self, request):
    wreq = WrappedRequest(request)
    self.policy._now = self.jar._now = int(time.time())

    # the cookiejar implementation iterates through all domains;
    # instead we restrict the lookup to potential matches on the domain
    req_host = urlparse_cached(request).hostname
    if not req_host:
        return

    if not IPV4_RE.search(req_host):
        hosts = potential_domain_matches(req_host)
        if "." not in req_host:
            # append as a one-element list; += with a bare string would
            # add each character separately
            hosts += [req_host + ".local"]
    else:
        hosts = [req_host]

    cookies = []
    for host in hosts:
        if host in self.jar._cookies:
            cookies += self.jar._cookies_for_domain(host, wreq)

    attrs = self.jar._cookie_attrs(cookies)
    if attrs and not wreq.has_header("Cookie"):
        wreq.add_unredirected_header("Cookie", "; ".join(attrs))

    self.processed += 1
    if self.processed % self.check_expired_frequency == 0:
        # This is still quite inefficient for a large number of cookies
        self.jar.clear_expired_cookies()
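# Illustration of the restriction above (hedged: the exact output of the
# potential_domain_matches() helper depends on the version in use, but it is
# expected to return the handful of domains a cookie could have been set for):
#
#   potential_domain_matches("www.example.com")
#   -> ['www.example.com', 'example.com', '.www.example.com', '.example.com']
#
# so the loop above only probes those keys of self.jar._cookies instead of
# scanning every stored cookie domain.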
def _set_connection_attributes(self, request):
    parsed = urlparse_cached(request)
    self.scheme, self.netloc, self.host, self.port, self.path = _parsed_url_args(parsed)
    proxy = request.meta.get('proxy')
    if proxy:
        self.scheme, _, self.host, self.port, _ = _parse(proxy)
        self.path = self.url
def should_follow(self, request, spider):
    if not hasattr(self, 'host_regex'):
        self.spider_opened(spider)
    regex = self.host_regex
    # hostname can be None for malformed URLs (e.g. javascript: links)
    host = urlparse_cached(request).hostname or ''
    return bool(regex.search(host))
def should_cache_request(self, request):
    if urlparse_cached(request).scheme in self.ignore_schemes:
        return False
    cc = self._parse_cachecontrol(request)
    # obey user-agent directive "Cache-Control: no-store"
    if 'no-store' in cc:
        return False
    # any other request is eligible for caching
    return True
def _get_slot_key(self, request, spider):
    if 'download_slot' in request.meta:
        return request.meta['download_slot']

    key = urlparse_cached(request).hostname or ''
    if self.ip_concurrency:
        key = dnscache.get(key, key)

    return key
def download_request(self, request, spider):
    scheme = urlparse_cached(request).scheme
    try:
        handler = self._handlers[scheme].download_request
    except KeyError:
        msg = self._notconfigured.get(scheme, 'no handler available for that scheme')
        raise NotSupported("Unsupported URL scheme '%s': %s" % (scheme, msg))
    return handler(request, spider)
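# Hypothetical illustration of the per-scheme dispatch table consulted above
# (names and contents are assumptions for the sketch, not actual configuration):
#
#   self._handlers = {
#       'http': http_handler,    # each value exposes download_request(request, spider)
#       'https': http_handler,
#       'file': file_handler,
#       's3': s3_handler,
#   }
#   self._notconfigured = {'ftp': 'ftp handler is disabled'}
#
# A scheme missing from both dicts raises NotSupported with the generic message.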
def robot_parser(self, request, spider):
    url = urlparse_cached(request)
    netloc = url.netloc
    if netloc not in self._parsers:
        self._parsers[netloc] = None
        robotsurl = "%s://%s/robots.txt" % (url.scheme, url.netloc)
        robotsreq = Request(robotsurl, priority=self.DOWNLOAD_PRIORITY)
        dfd = self.crawler.engine.download(robotsreq, spider)
        dfd.addCallback(self._parse_robots)
        self._spider_netlocs.add(netloc)
    return self._parsers[netloc]
def download_request(self, request, spider):
    p = urlparse_cached(request)
    scheme = 'https' if request.meta.get('is_secure') else 'http'
    bucket = p.hostname
    path = p.path + '?' + p.query if p.query else p.path
    url = '%s://%s.s3.amazonaws.com%s' % (scheme, bucket, path)
    signed_headers = self.conn.make_request(
        method=request.method,
        bucket=bucket,
        key=p.path,
        query_args=p.query,
        headers=request.headers,
        data=request.body)
    httpreq = request.replace(url=url, headers=signed_headers)
    return self._download_http(httpreq, spider)
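# Worked example of the rewrite above (hedged: the signing itself is delegated
# to self.conn.make_request and not shown). A request for
#
#   s3://mybucket/some/key        with request.meta['is_secure'] unset
#
# parses to hostname 'mybucket' (used as the bucket) and path '/some/key',
# and is therefore re-issued as a plain HTTP request to
#
#   http://mybucket.s3.amazonaws.com/some/key
#
# with the signed headers attached.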
def request_httprepr(request):
    """Return the raw HTTP representation (as a string) of the given request.

    This is provided only for reference, since it's not the actual stream of
    bytes that will be sent when performing the request (that's controlled
    by Twisted).
    """
    parsed = urlparse_cached(request)
    path = urlunparse(('', '', parsed.path or '/', parsed.params, parsed.query, ''))
    s = "%s %s HTTP/1.1\r\n" % (request.method, path)
    s += "Host: %s\r\n" % parsed.hostname
    if request.headers:
        s += request.headers.to_string() + "\r\n"
    s += "\r\n"
    s += request.body
    return s
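# Minimal usage sketch for request_httprepr above, assuming scrapy.http.Request
# is available; the URL is made up for illustration.
from scrapy.http import Request

req = Request("http://www.example.com/some/page?q=1")
print(request_httprepr(req))
# Expected output (roughly; default requests carry no extra headers or body):
#   GET /some/page?q=1 HTTP/1.1
#   Host: www.example.com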
def process_spider_output(self, response, result, spider):
    for x in result:
        if isinstance(x, Request):
            if x.dont_filter or self.should_follow(x, spider):
                yield x
            else:
                domain = urlparse_cached(x).hostname
                if domain and domain not in self.domains_seen:
                    self.domains_seen.add(domain)
                    log.spider_log("Filtered offsite request to %s: %s" % (domain, x.url),
                                   level=log.DEBUG, spider=spider)
                    self.stats.inc_value('offsite/domains', spider=spider)
                self.stats.inc_value('offsite/filtered', spider=spider)
        else:
            yield x
def should_cache_request(self, request):
    return urlparse_cached(request).scheme not in self.ignore_schemes
def get_origin_req_host(self):
    return urlparse_cached(self.request).hostname

def get_type(self):
    return urlparse_cached(self.request).scheme

def get_host(self):
    return urlparse_cached(self.request).netloc
def _parse_robots(self, response):
    rp = robotparser.RobotFileParser(response.url)
    rp.parse(response.body.splitlines())
    self._parsers[urlparse_cached(response).netloc] = rp
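# Self-contained sketch of the stdlib robotparser API used above (Python 2
# module name, matching the code; Python 3 ships it as urllib.robotparser).
# The rules and URLs are made up for illustration.
import robotparser

rp = robotparser.RobotFileParser("http://www.example.com/robots.txt")
rp.parse("User-agent: *\nDisallow: /private".splitlines())
print(rp.can_fetch("my-bot", "http://www.example.com/private/page"))  # False
print(rp.can_fetch("my-bot", "http://www.example.com/public/page"))   # True
# The middleware stores one such parser per netloc so later requests can be
# checked with can_fetch() without re-downloading robots.txt.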