def retrieve_response(self, spider, request):
    """Return the cached response for this request, or None if not cached.

    The cached payload is a pickled dict produced by ``store_response``;
    only the response-side entries are needed to rebuild the Response.
    """
    key = self._get_key(spider, request)
    epoch = request.meta.get('epoch')  # guaranteed to be True or datetime
    s3_key = self._get_s3_key(key, epoch)
    if not s3_key:
        return
    log.msg('S3Storage (epoch => %s): retrieving response for %s.' % (epoch, request.url))
    try:
        # May raise boto.exception.S3ResponseError -- see store_response for
        # error descriptions. Letting it propagate naturally (instead of the
        # previous `except ... as e: raise e`) preserves the original traceback.
        data_string = s3_key.get_contents_as_string()
    finally:
        # Always release the key, even when S3 raises.
        s3_key.close()
    # NOTE(review): pickle.loads on cached data is only safe if the cache
    # bucket is private/trusted -- confirm.
    data = pickle.loads(data_string)
    metadata = data['metadata']
    response_headers = data['response_headers']
    response_body = data['response_body']
    url = metadata['response_url']
    status = metadata.get('status')
    # Lowercase local name avoids shadowing the Response class.
    respcls = responsetypes.from_args(headers=response_headers, url=url)
    return respcls(url=url, headers=response_headers, status=status, body=response_body)
def test_from_args(self):
    # TODO: add more tests that check precedence between the different arguments
    cases = [
        # (kwargs for from_args, expected response class)
        ({'url': 'http://www.example.com/data.csv'}, TextResponse),
        # headers takes precedence over url
        ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
          'url': 'http://www.example.com/item/'}, HtmlResponse),
        ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}),
          'url': 'http://www.example.com/page/'}, Response),
    ]
    for kwargs, expected in cases:
        result = responsetypes.from_args(**kwargs)
        assert result is expected, "%s ==> %s != %s" % (kwargs, result, expected)
def _is_bzip2(self, response):
    """If the body decompresses as bzip2, return a response carrying the
    decompressed body (and a re-guessed response class); else return None."""
    try:
        decompressed = bz2.decompress(response.body)
    except IOError:
        return
    cls = responsetypes.from_args(body=decompressed)
    return response.replace(body=decompressed, cls=cls)
def _is_gzip(self, response):
    """If the body is a gzip stream, return a response carrying the
    decompressed body (and a re-guessed response class); else return None."""
    try:
        decompressed = gzip.GzipFile(fileobj=StringIO(response.body)).read()
    except IOError:
        return
    cls = responsetypes.from_args(body=decompressed)
    return response.replace(body=decompressed, cls=cls)
def retrieve_response(self, spider, request):
    """Return the cached response for this request, or None if not cached."""
    data = self._read_data(spider, request)
    if data is None:
        return  # not cached
    headers = Headers(data['headers'])
    respcls = responsetypes.from_args(headers=headers, url=data['url'])
    return respcls(url=data['url'], headers=headers,
                   status=data['status'], body=data['body'])
def _is_zip(self, response):
    """If the body is a ZIP archive, return a response whose body is the
    first archive member (with a re-guessed response class); else None."""
    buf = StringIO(response.body)
    try:
        archive = zipfile.ZipFile(buf)
    except zipfile.BadZipfile:
        return
    first_name = archive.namelist()[0]
    extracted = archive.read(first_name)
    cls = responsetypes.from_args(filename=first_name, body=extracted)
    return response.replace(body=extracted, cls=cls)
def _is_tar(self, response):
    """If the body is a tar archive, return a response whose body is the
    first archive member (with a re-guessed response class); else None.
    """
    archive = StringIO(response.body)
    try:
        # A name is unnecessary when reading from a file object; the previous
        # `name=mktemp()` only produced a throwaway label via the deprecated,
        # race-prone tempfile.mktemp().
        tar_file = tarfile.open(fileobj=archive)
    except tarfile.ReadError:
        return
    body = tar_file.extractfile(tar_file.members[0]).read()
    respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
    return response.replace(body=body, cls=respcls)
def test_from_args(self):
    # TODO: add more tests that check precedence between the different arguments
    expectations = (
        ({'url': 'http://www.example.com/data.csv'}, TextResponse),
        # headers takes precedence over url
        ({'url': 'http://www.example.com/item/',
          'headers': Headers({'Content-Type': ['text/html; charset=utf-8']})},
         HtmlResponse),
        ({'url': 'http://www.example.com/page/',
          'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']})},
         Response),
    )
    for args, wanted in expectations:
        got = responsetypes.from_args(**args)
        assert got is wanted, "%s ==> %s != %s" % (args, got, wanted)
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    rpath = self._get_request_path(spider, request)
    with open(join(rpath, 'response_body'), 'rb') as body_file:
        body = body_file.read()
    with open(join(rpath, 'response_headers'), 'rb') as headers_file:
        rawheaders = headers_file.read()
    headers = Headers(headers_raw_to_dict(rawheaders))
    url = metadata.get('response_url')
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=metadata['status'], body=body)
def retrieve_response(self, spider, request):
    """Return response if present in cache, or None otherwise."""
    metadata = self._read_meta(spider, request)
    if metadata is None:
        return  # not cached
    rpath = self._get_request_path(spider, request)
    with open(join(rpath, 'response_body'), 'rb') as fbody:
        body = fbody.read()
    with open(join(rpath, 'response_headers'), 'rb') as fheaders:
        rawheaders = fheaders.read()
    # We failback to metadata['url'] to support old generated caches. TODO: remove for Scrapy 0.11
    url = metadata.get('response_url') or metadata['url']
    headers = Headers(headers_raw_to_dict(rawheaders))
    respcls = responsetypes.from_args(headers=headers, url=url)
    return respcls(url=url, headers=headers, status=metadata['status'], body=body)
def process_response(self, request, response, spider):
    """Decode a Content-Encoding-compressed response body and re-guess
    the response class from the remaining headers."""
    if not isinstance(response, Response):
        return response
    content_encoding = response.headers.getlist('Content-Encoding')
    if content_encoding:
        # Decode the outermost (last-applied) encoding.
        encoding = content_encoding.pop()
        decoded_body = self._decode(response.body, encoding.lower())
        respcls = responsetypes.from_args(headers=response.headers,
                                          url=response.url)
        kwargs = {'cls': respcls, 'body': decoded_body}
        if issubclass(respcls, TextResponse):
            # force recalculating the encoding until we make sure the
            # responsetypes guessing is reliable
            kwargs['encoding'] = None
        response = response.replace(**kwargs)
        if not content_encoding:
            # All encodings handled; drop the now-stale header.
            del response.headers['Content-Encoding']
    return response
def _all_in_one_read_download_file(request, spider):
    """Read the local file referenced by a file:// request and build a
    Response of the class guessed from the filename and body.
    """
    filepath = url2pathname(request.url.split("file://")[1])
    # Binary mode: the file may hold arbitrary bytes (images, archives);
    # text mode would mangle them on platforms that translate newlines.
    # This also matches download_request(), which already uses 'rb'.
    with open(filepath, 'rb') as f:
        body = f.read()
    respcls = responsetypes.from_args(filename=filepath, body=body)
    return respcls(url=request.url, body=body)
def _build_response(self, body):
    """Build a Response of the appropriate class from the parsed status
    and headers plus the given body."""
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self.url)
    return respcls(url=self.url, status=int(self.status),
                   headers=headers, body=body)
def download_request(self, request, spider):
    """Read the local file referenced by the request URL and return a
    Response of the class guessed from the filename and body.
    """
    filepath = file_uri_to_path(request.url)
    # Close the file deterministically instead of leaking the handle
    # until garbage collection.
    with open(filepath, 'rb') as f:
        body = f.read()
    respcls = responsetypes.from_args(filename=filepath, body=body)
    return respcls(url=request.url, body=body)
def _build_response(self, body, request):
    """Build the final Response, recording the download latency
    (headers time minus start time) in the request meta."""
    request.meta['download_latency'] = self.headers_time - self.start_time
    headers = Headers(self.response_headers)
    respcls = responsetypes.from_args(headers=headers, url=self.url)
    return respcls(url=self.url, status=int(self.status),
                   headers=headers, body=body)