def retrieve_response(self, spider, request):
    """
    Return the cached response for *request*, or None if not cached.

    Looks up the S3 key derived from the spider/request cache key and the
    request's 'epoch' meta value, unpickles the stored payload, and
    rebuilds a Response object of the type guessed from its headers/URL.
    """
    key = self._get_key(spider, request)

    epoch = request.meta.get('epoch')  # guaranteed to be True or datetime
    s3_key = self._get_s3_key(key, epoch)

    if not s3_key:
        return

    log.msg('S3Storage (epoch => %s): retrieving response for %s.' % (epoch, request.url))
    try:
        data_string = s3_key.get_contents_as_string()
    finally:
        # Always release the key's resources.  An S3ResponseError (see
        # store_response for error descriptions) propagates unchanged —
        # the previous `except ... raise e` added nothing and reset the
        # traceback under Python 2.
        s3_key.close()

    data = pickle.loads(data_string)

    # The payload also carries 'request_headers' / 'request_body' for
    # parity with store_response; they are not needed to rebuild the
    # response, so they are not extracted here.
    metadata = data['metadata']
    response_headers = data['response_headers']
    response_body = data['response_body']

    url = metadata['response_url']
    status = metadata.get('status')
    # Guess the most specific Response subclass; `respcls` avoids
    # shadowing the Response class name used elsewhere in this file.
    respcls = responsetypes.from_args(headers=response_headers, url=url)
    return respcls(url=url, headers=response_headers, status=status, body=response_body)
# Example 2
 def test_from_args(self):
     # TODO: add more tests that check precedence between the different arguments
     cases = [
         ({'url': 'http://www.example.com/data.csv'}, TextResponse),
         # headers takes precedence over url
         ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
           'url': 'http://www.example.com/item/'}, HtmlResponse),
         ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}),
           'url': 'http://www.example.com/page/'}, Response),
     ]
     for kwargs, expected in cases:
         got = responsetypes.from_args(**kwargs)
         assert got is expected, "%s ==> %s != %s" % (kwargs, got, expected)
# Example 3
    def _is_bzip2(self, response):
        try:
            body = bz2.decompress(response.body)
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)
# Example 4
    def _is_gzip(self, response):
        archive = StringIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)
# Example 5
 def retrieve_response(self, spider, request):
     """Rebuild and return the cached response, or None when not cached."""
     cached = self._read_data(spider, request)
     if cached is None:
         return  # not cached
     url, status = cached['url'], cached['status']
     headers = Headers(cached['headers'])
     body = cached['body']
     # Pick the most specific Response subclass for this payload.
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, headers=headers, status=status, body=body)
# Example 6
    def _is_zip(self, response):
        archive = StringIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return

        namelist = zip_file.namelist()
        body = zip_file.read(namelist[0])
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)
# Example 7
    def _is_tar(self, response):
        archive = StringIO(response.body)
        try:
            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
        except tarfile.ReadError:
            return

        body = tar_file.extractfile(tar_file.members[0]).read()
        respcls = responsetypes.from_args(filename=tar_file.members[0].name,
                                          body=body)
        return response.replace(body=body, cls=respcls)
# Example 8
 def retrieve_response(self, spider, request):
     """Return the cached response for this request, or None otherwise."""
     record = self._read_data(spider, request)
     if record is None:
         return  # not cached
     headers = Headers(record['headers'])
     # Guess the response class from the stored headers and URL.
     respcls = responsetypes.from_args(headers=headers, url=record['url'])
     return respcls(url=record['url'], headers=headers,
                    status=record['status'], body=record['body'])
# Example 9
    def test_from_args(self):
        # TODO: add more tests that check precedence between the different arguments
        cases = (
            ({'url': 'http://www.example.com/data.csv'}, TextResponse),
            # headers takes precedence over url
            ({'headers': Headers({'Content-Type': ['text/html; charset=utf-8']}),
              'url': 'http://www.example.com/item/'}, HtmlResponse),
            ({'headers': Headers({'Content-Disposition': ['attachment; filename="data.xml.gz"']}),
              'url': 'http://www.example.com/page/'}, Response),
        )
        for kwargs, expected in cases:
            result = responsetypes.from_args(**kwargs)
            assert result is expected, "%s ==> %s != %s" % (kwargs, result, expected)
# Example 10
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     metadata = self._read_meta(spider, request)
     if metadata is None:
         return  # not cached
     rpath = self._get_request_path(spider, request)
     # Body and raw headers are stored as separate files next to the metadata.
     with open(join(rpath, 'response_body'), 'rb') as body_file:
         body = body_file.read()
     with open(join(rpath, 'response_headers'), 'rb') as headers_file:
         rawheaders = headers_file.read()
     headers = Headers(headers_raw_to_dict(rawheaders))
     url = metadata.get('response_url')
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, headers=headers,
                    status=metadata['status'], body=body)
# Example 11
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     metadata = self._read_meta(spider, request)
     if metadata is None:
         return  # not cached
     rpath = self._get_request_path(spider, request)
     # Body and raw headers live in separate files alongside the metadata.
     with open(join(rpath, 'response_body'), 'rb') as body_file:
         body = body_file.read()
     with open(join(rpath, 'response_headers'), 'rb') as headers_file:
         rawheaders = headers_file.read()
     # We failback to metadata['url'] to support old generated caches. TODO: remove for Scrapy 0.11
     url = metadata.get('response_url') or metadata['url']
     headers = Headers(headers_raw_to_dict(rawheaders))
     respcls = responsetypes.from_args(headers=headers, url=url)
     return respcls(url=url, headers=headers,
                    status=metadata['status'], body=body)
# Example 12
 def retrieve_response(self, spider, request):
     """Return response if present in cache, or None otherwise."""
     cached_meta = self._read_meta(spider, request)
     if cached_meta is None:
         return  # not cached
     request_dir = self._get_request_path(spider, request)
     with open(join(request_dir, 'response_body'), 'rb') as f:
         response_body = f.read()
     with open(join(request_dir, 'response_headers'), 'rb') as f:
         raw_header_bytes = f.read()
     # We failback to metadata['url'] to support old generated caches. TODO: remove for Scrapy 0.11
     response_url = cached_meta.get('response_url') or cached_meta['url']
     parsed_headers = Headers(headers_raw_to_dict(raw_header_bytes))
     cls = responsetypes.from_args(headers=parsed_headers, url=response_url)
     return cls(url=response_url, headers=parsed_headers,
                status=cached_meta['status'], body=response_body)
# Example 13
    def process_response(self, request, response, spider):
        """Decode one layer of Content-Encoding and re-type the response."""
        if not isinstance(response, Response):
            return response
        content_encoding = response.headers.getlist('Content-Encoding')
        if content_encoding:
            # Decode the outermost (last-applied) encoding only.
            encoding = content_encoding.pop()
            decoded_body = self._decode(response.body, encoding.lower())
            respcls = responsetypes.from_args(headers=response.headers,
                                              url=response.url)
            kwargs = {'cls': respcls, 'body': decoded_body}
            if issubclass(respcls, TextResponse):
                # force recalculating the encoding until we make sure the
                # responsetypes guessing is reliable
                kwargs['encoding'] = None
            response = response.replace(**kwargs)
            if not content_encoding:
                # No encodings remain — drop the header entirely.
                del response.headers['Content-Encoding']
        return response
# Example 14
    def process_response(self, request, response, spider):
        """Undo the outermost Content-Encoding layer on a real Response."""
        if isinstance(response, Response):
            encodings = response.headers.getlist('Content-Encoding')
            if encodings:
                last_encoding = encodings.pop()
                new_body = self._decode(response.body, last_encoding.lower())
                respcls = responsetypes.from_args(headers=response.headers, url=response.url)
                replace_kwargs = dict(cls=respcls, body=new_body)
                if issubclass(respcls, TextResponse):
                    # force recalculating the encoding until we make sure the
                    # responsetypes guessing is reliable
                    replace_kwargs['encoding'] = None
                response = response.replace(**replace_kwargs)
                if not encodings:
                    # All encodings handled; remove the now-stale header.
                    del response.headers['Content-Encoding']
        return response
# Example 15
def _all_in_one_read_download_file(request, spider):
    """
    Build a response for a ``file://`` request by reading the file from disk.

    The response class is guessed from the filename and body.  The file is
    opened in binary mode so the body is raw bytes, matching the other
    file-download handlers in this file (which all use ``'rb'``); text mode
    would decode the content and break on non-text files.
    """
    filepath = url2pathname(request.url.split("file://")[1])
    with open(filepath, 'rb') as f:
        body = f.read()
    respcls = responsetypes.from_args(filename=filepath, body=body)
    return respcls(url=request.url, body=body)
# Example 16
 def _build_response(self, body):
     """Assemble a Response of the appropriate type from this handler's state."""
     headers = Headers(self.response_headers)
     # Guess the most specific Response subclass from headers and URL.
     respcls = responsetypes.from_args(headers=headers, url=self.url)
     return respcls(url=self.url, status=int(self.status),
                    headers=headers, body=body)
# Example 17
 def download_request(self, request, spider):
     """
     Handle a ``file://`` request by reading the target file from disk.

     Returns a response whose class is guessed from the filename and body.
     The file is opened with a context manager so the handle is closed
     deterministically (the original left closing to the garbage collector).
     """
     filepath = file_uri_to_path(request.url)
     with open(filepath, 'rb') as f:
         body = f.read()
     respcls = responsetypes.from_args(filename=filepath, body=body)
     return respcls(url=request.url, body=body)
# Example 18
 def _build_response(self, body, request):
     """Assemble a typed Response from handler state, recording latency."""
     # Time elapsed between request start and headers being received.
     request.meta['download_latency'] = self.headers_time - self.start_time
     headers = Headers(self.response_headers)
     respcls = responsetypes.from_args(headers=headers, url=self.url)
     return respcls(url=self.url, status=int(self.status),
                    headers=headers, body=body)
# Example 19
 def download_request(self, request, spider):
     """
     Handle a ``file://`` request by reading the target file from disk.

     Returns a response whose class is guessed from the filename and body.
     Uses a context manager so the file handle is closed deterministically
     (the original left closing to the garbage collector).
     """
     filepath = file_uri_to_path(request.url)
     with open(filepath, 'rb') as f:
         body = f.read()
     respcls = responsetypes.from_args(filename=filepath, body=body)
     return respcls(url=request.url, body=body)