예제 #1
0
 def store_response(self, spider, request, response):
     """Store the given response in MongoDB.

     The document is keyed by the request fingerprint and bundles the raw
     request/response headers and bodies together with small metadata.
     """
     key = request_fingerprint(request)
     response_headers = headers_dict_to_raw(response.headers)
     response_body = self._get_body(response.headers, response.body)
     request_headers = headers_dict_to_raw(request.headers)
     request_body = self._get_body(request.headers, request.body)
     stored_data = {
         'metadata': {
             'url': request.url,
             'method': request.method,
             'status': response.status,
             'response_url': response.url,
             'timestamp': time(),
         },
         'response_headers': response_headers,
         'response_body': response_body,
         'request_headers': request_headers,
         'request_body': request_body,
     }
     try:
         self.collection.insert({"_id": key, "value": stored_data})
     except Exception as e:
         # Cache writes are best-effort: report the failure and keep crawling.
         # ('except Exception, e' is Python 2 only; e.message no longer exists.)
         print(e)
예제 #2
0
 def store_response(self, spider: TSpider, request: TRequest,
                    response: TResponse) -> None:
     """Store the given response in the cache."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         "url": request.url,
         "method": request.method,
         "status": response.status,
         "response_url": response.url,
         "timestamp": time(),
     }
     # One (filename, writer) pair per cache artifact, written in order.
     writers = (
         ("meta", lambda f: f.write(to_bytes(repr(metadata)))),
         ("pickled_meta", lambda f: pickle.dump(metadata, f, protocol=2)),
         ("response_headers",
          lambda f: f.write(headers_dict_to_raw(response.headers))),
         ("response_body", lambda f: f.write(response.body)),
         ("request_headers",
          lambda f: f.write(headers_dict_to_raw(request.headers))),
         ("request_body", lambda f: f.write(request.body)),
     )
     for filename, write in writers:
         with self._open(os.path.join(rpath, filename), "wb") as f:
             write(f)
예제 #3
0
 def store_response(self, spider, request, response):
     """Store the given response in the on-disk cache directory."""
     rpath = self._get_request_path(spider, request)
     if not os.path.exists(rpath):
         os.makedirs(rpath)
     metadata = {
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
     }
     # Pre-render every artifact as bytes, then write each to its own file.
     artifacts = [
         ('meta', to_bytes(repr(metadata))),
         ('pickled_meta', pickle.dumps(metadata, protocol=4)),
         ('response_headers', headers_dict_to_raw(response.headers)),
         ('response_body', response.body),
         ('request_headers', headers_dict_to_raw(request.headers)),
         ('request_body', request.body),
     ]
     for fname, payload in artifacts:
         with self._open(os.path.join(rpath, fname), 'wb') as f:
             f.write(payload)
예제 #4
0
def store_response(rpath, request, response):
    """Store the given response in the cache directory *rpath*.

    Writes one file per artifact: plain-text metadata, pickled metadata,
    raw headers, and bodies for both request and response.
    """
    if not exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }
    with open(join(rpath, 'meta'), 'wb') as f:
        # Encode: the file is binary-mode, so writing a plain str raises
        # TypeError on Python 3.
        f.write(repr(metadata).encode('utf-8'))
    with open(join(rpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)
    with open(join(rpath, 'response_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(response.headers))
    with open(join(rpath, 'response_body'), 'wb') as f:
        f.write(response.body)
    with open(join(rpath, 'request_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(request.headers))
    with open(join(rpath, 'request_body'), 'wb') as f:
        f.write(request.body)
예제 #5
0
파일: httpcache.py 프로젝트: st-li/rg_chn3
    def store_response(self, spider, request, response):
        """Store the given response in MongoDB, sharded over 1000 collections.

        The target collection is derived from the hex request fingerprint so
        documents spread evenly across collection0 .. collection999.
        """
        key = request_fingerprint(request)
        response_headers = headers_dict_to_raw(response.headers)
        response_body = self._get_body(response.headers, response.body)
        request_headers = headers_dict_to_raw(request.headers)
        request_body = self._get_body(request.headers, request.body)
        stored_data = {
            'metadata': {
                'url': request.url,
                'method': request.method,
                'status': response.status,
                'response_url': response.url,
                'timestamp': time(),
            },
            'response_headers': response_headers,
            'response_body': response_body,
            'request_headers': request_headers,
            'request_body': request_body,
        }

        # Pick the shard: fingerprint is hex, so interpret it base-16.
        collection_index = int(key, 16) % 1000
        collection_name = 'collection' + str(collection_index)
        collection = self.db[collection_name]

        try:
            collection.insert({"_id": key, "value": stored_data})
            print("-----------------Write cache %s------------------" % collection_name)
        except Exception as e:
            # Best-effort write: report and continue. (Python 2 'print x' and
            # 'except Exception, e' are syntax errors on Python 3.)
            print(e)
예제 #6
0
    def test_headers_dict_to_raw_wrong_values(self):
        """Values that are not bytes (or lists of bytes) are dropped."""
        bad_only = OrderedDict([(b'Content-type', 0)])
        self.assertEqual(headers_dict_to_raw(bad_only), b'')

        mixed = OrderedDict([
            (b'Content-type', 1),
            (b'Accept', [b'gzip']),
        ])
        self.assertEqual(headers_dict_to_raw(mixed), b'Accept: gzip')
예제 #7
0
파일: test_http.py 프로젝트: scrapy/w3lib
    def test_headers_dict_to_raw_wrong_values(self):
        """Non-bytes header values must be skipped by headers_dict_to_raw."""
        dct: HeadersDictInput = OrderedDict([
            (b"Content-type", 0),
        ])
        # A dict holding only invalid values serializes to the empty string.
        # (The original repeated this assertion twice; once is enough.)
        self.assertEqual(headers_dict_to_raw(dct), b"")

        dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])])
        self.assertEqual(headers_dict_to_raw(dct), b"Accept: gzip")
예제 #8
0
파일: test_http.py 프로젝트: scrapy/w3lib
    def test_headers_dict_to_raw_wrong_values(self):
        """Non-bytes header values are silently skipped."""
        dct = OrderedDict([(b'Content-type', 0)])
        self.assertEqual(headers_dict_to_raw(dct), b'')

        dct = OrderedDict([(b'Content-type', 1), (b'Accept', [b'gzip'])])
        self.assertEqual(headers_dict_to_raw(dct), b'Accept: gzip')
예제 #9
0
 def store_response(self, spider, request, response):
     """Store the given response in the cache.

     Serializes the pair as JSON and adds it to the bucket under the
     inverted-URL key.
     """
     # Compute the key once; it doubles as the stored document's _id.
     # (The original called self._inverse_url(request.url) twice.)
     _id = self._inverse_url(request.url)
     data = {
         '_id': _id,
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
         'response_body': response.body_as_unicode(),
         'response_headers': headers_dict_to_raw(response.headers),
         'request_headers': headers_dict_to_raw(request.headers),
         'request_body': request.body,
         'encoding': response.encoding
     }
     self.bucket.add(_id, 0, 0, json.dumps(data))
예제 #10
0
 def store_response(self, spider, request, response):
     """Store the given response in the cache.

     Serializes the pair as JSON and adds it to the bucket under the
     inverted-URL key.
     """
     # Compute the key once; it doubles as the stored document's _id.
     # (The original called self._inverse_url(request.url) twice.)
     _id = self._inverse_url(request.url)
     data = {
         '_id': _id,
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
         'response_body': response.body_as_unicode(),
         'response_headers': headers_dict_to_raw(response.headers),
         'request_headers': headers_dict_to_raw(request.headers),
         'request_body': request.body,
         'encoding': response.encoding
     }
     self.bucket.add(_id, 0, 0, json.dumps(data))
    def _update_cache(self, spider, request, response):
        """Overwrite the existing cache row for this request in PostgreSQL.

        On a database error the transaction is rolled back, the error is
        logged, and a note is appended to ``spider.errors``.
        """
        key = self._get_request_key(spider, request)
        request_meta = {
            'headers': headers_dict_to_raw(request.headers),
            'method': request.method
        }

        response_meta = {
            'headers': headers_dict_to_raw(response.headers),
            # str() so all values in the meta dict are of str type
            'status': str(response.status)
        }
        try:
            query = """
                UPDATE http_cache_binary
                SET ts=%(ts)s,
                request_url=%(request_url)s,
                request_meta=%(request_meta)s,
                request_body=%(request_body)s,
                response_url=%(response_url)s,
                response_meta=%(response_meta)s,
                response_body=%(response_body)s
                WHERE hashkey=%(hashkey)s
            """
            data = {
                'hashkey': key,
                'ts': datetime.datetime.now(),
                'request_url': request.url,
                'request_meta': request_meta,
                'request_body': psycopg2.Binary(request.body),
                'response_url': response.url,
                'response_meta': response_meta,
                'response_body': psycopg2.Binary(response.body)
            }
            self.cursor.execute(query, data)
        except DatabaseError as e:
            # 'except DatabaseError, e' is Python 2 syntax; 'as' is required
            # on Python 3.
            self.db.rollback()
            err = ("[HTTP Cache] Error: failed to update cache in database: %s"
                   % str(e))
            logging.error(err)
            if not hasattr(spider, 'errors'):
                spider.errors = []
            spider.errors.append(
                'HTTP Cache failed. Please contact Yuri <*****@*****.**>'
            )
예제 #12
0
 def test_headers_dict_to_raw(self):
     """A bytes->bytes mapping serializes to CRLF-joined header lines."""
     headers = OrderedDict([
         (b'Content-type', b'text/html'),
         (b'Accept', b'gzip'),
     ])
     expected = b'Content-type: text/html\r\nAccept: gzip'
     self.assertEqual(headers_dict_to_raw(headers), expected)
    def _save_cache(self, spider, request, response):
        """Insert a new cache row for this request into PostgreSQL.

        Duplicate-key errors are ignored (the row is already cached); any
        other database error is rolled back, logged, and recorded on
        ``spider.errors``.
        """
        key = self._get_request_key(spider, request)
        request_meta = {
            'headers': headers_dict_to_raw(request.headers),
            'method': request.method
        }

        response_meta = {
            'headers': headers_dict_to_raw(response.headers),
            # str() so all values in the meta dict are of str type
            'status': str(response.status)
        }
        try:
            query = """
                INSERT INTO http_cache_binary
                (hashkey, ts, request_url, request_meta, request_body,
                response_url, response_meta, response_body)
                VALUES
                (%(hashkey)s, %(ts)s, %(request_url)s, %(request_meta)s, %(request_body)s,
                %(response_url)s, %(response_meta)s, %(response_body)s)
            """
            data = {
                'hashkey': key,
                'ts': datetime.datetime.now(),
                'request_url': request.url,
                'request_meta': request_meta,
                'request_body': psycopg2.Binary(request.body),
                'response_url': response.url,
                'response_meta': response_meta,
                'response_body': psycopg2.Binary(response.body)
            }
            self.cursor.execute(query, data)
        except DatabaseError as e:
            # 'except DatabaseError, e' is Python 2 syntax; 'as' is required
            # on Python 3.
            self.db.rollback()
            err = ("[HTTP Cache] Error: failed to save cache to database: %s"
                   % str(e))
            if 'duplicate' not in err.lower():  # ignore 'duplicate' key errors
                logging.error(err)
                if not hasattr(spider, 'errors'):
                    spider.errors = []
                spider.errors.append(
                    'HTTP Cache failed. Please contact Yuri <*****@*****.**>'
                )
 def store_response(self, spider, request, response):
     """Store the given response in the cache, one backend key per part."""
     meta_data = {
         "url": request.url,
         "method": request.method,
         "status": response.status,
         "response_url": response.url,
         "timestamp": time(),
     }
     # Render every part up front, then push each under its own key.
     parts = [
         ("meta_data", json.dumps(meta_data)),
         ("response_headers", headers_dict_to_raw(response.headers)),
         ("response_body", response.body),
         ("request_headers", headers_dict_to_raw(request.headers)),
         ("request_body", request.body),
     ]
     for part_name, payload in parts:
         cache_key = self.key_for(spider, request, part_name)
         self.client.set(cache_key, payload, self.expiration_secs)
예제 #15
0
    def test_headers_dict_to_raw_listtuple(self):
        """List/tuple header values expand into one line per element."""
        cases = [
            (OrderedDict([(b'Content-type', [b'text/html']),
                          (b'Accept', [b'gzip'])]),
             b'Content-type: text/html\r\nAccept: gzip'),
            (OrderedDict([(b'Content-type', (b'text/html', )),
                          (b'Accept', (b'gzip', ))]),
             b'Content-type: text/html\r\nAccept: gzip'),
            (OrderedDict([(b'Cookie', (b'val001', b'val002')),
                          (b'Accept', b'gzip')]),
             b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'),
            (OrderedDict([(b'Cookie', [b'val001', b'val002']),
                          (b'Accept', b'gzip')]),
             b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'),
        ]
        for headers, expected in cases:
            self.assertEqual(headers_dict_to_raw(headers), expected)
예제 #16
0
 def store_response(self, spider, request, response):
     """Store the given response in the cache as one pickled object."""
     # TODO: Use a buffer instead of sending the cache files one by one
     object_key = self._get_request_path(request)
     meta = {
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
     }
     record = {
         'meta': meta,
         'response_headers': headers_dict_to_raw(response.headers),
         'response_body': response.body,
         'request_headers': headers_dict_to_raw(request.headers),
         'request_body': request.body
     }
     payload = pickle.dumps(record)
     self.put_object_to_key(payload, self.bucket_name, object_key)
예제 #17
0
    def store_response(self, spider, request, response):
        """Upload the request/response cache artifacts to S3.

        Redirect responses (302) are not cached.
        """
        if response.status == 302:
            return

        def path(key):
            # Storage location for one artifact of this request.
            return storage_path(request, key)

        metadata = {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        }
        for key, body in (
            ('meta', repr(metadata).encode('utf8')),
            ('pickled_meta', pickle.dumps(metadata, protocol=2)),
            ('request_headers', headers_dict_to_raw(request.headers)),
            ('request_body', request.body),
            ('response_headers', headers_dict_to_raw(response.headers)),
            ('response_body', response.body),
        ):
            send_s3_text(self.bucket, path(key), body)
 def setUp(self):
     """Build the crawler/spider fixtures and canned cache payloads."""
     self.crawler = get_crawler(Spider)
     self.spider = self.crawler._create_spider(self.spider_name)
     self.tmpdir = tempfile.mkdtemp()
     # A fixed request/response pair used by every test in this case.
     self.request = Request('http://www.example.com',
                            headers={'User-Agent': 'test'})
     self.response = Response('http://www.example.com',
                              headers={'Content-Type': 'text/html'},
                              body=b'test body',
                              status=202)
     self.crawler.stats.open_spider(self.spider)
     # Expected on-disk/cache representation of the pair above.
     # NOTE(review): 'timestamp' is captured at setUp time, so tests
     # presumably avoid exact-equality checks on it — confirm in the tests.
     self.cached_response = {
         'meta': {
             'url': self.request.url,
             'method': self.request.method,
             'status': self.response.status,
             'response_url': self.response.url,
             'timestamp': time.time(),
         },
         'response_headers': headers_dict_to_raw(self.response.headers),
         'response_body': self.response.body,
         'request_headers': headers_dict_to_raw(self.request.headers),
         'request_body': self.request.body
     }
     self.pickled_cached_response = pickle.dumps(self.cached_response)
     # Fake S3 GetObject responses: plain pickled body ...
     self.get_object_response = {
         'Body': StreamingBody(
             io.BytesIO(self.pickled_cached_response),
             len(self.pickled_cached_response)
         )
     }
     # ... and the same body gzip-compressed.
     self.gzipped_pickled_cached_response = gzip.compress(self.pickled_cached_response)
     self.get_object_response_gziped = {
         'Body': StreamingBody(
             io.BytesIO(self.gzipped_pickled_cached_response),
             len(self.gzipped_pickled_cached_response)
         )
     }
    def _save_cache(self, spider, request, response):
        """Persist the request/response pair in SSDB with an expiry."""
        key = self._get_request_key(spider, request)
        request_meta = {
            'headers': headers_dict_to_raw(request.headers),
            'method': request.method
        }

        # str() on the status so all values in the meta dict are str-typed.
        response_meta = {
            'headers': headers_dict_to_raw(response.headers),
            'status': str(response.status)
        }
        entry = {
            'hashkey': key,
            'ts': time(),
            'request_url': request.url,
            'request_meta': request_meta,
            'request_body': bytearray(request.body),
            'response_url': response.url,
            'response_meta': response_meta,
            'response_body': bytearray(response.body)
        }
        self.ssdb.setx(key, entry, self.expiration_secs)
예제 #20
0
파일: httpcache.py 프로젝트: floppya/scrapy
 def store_response(self, spider, request, response):
     """Store the given response in the cache.

     Writes the metadata (plain repr and pickled) plus raw headers and
     bodies as separate files under the request's cache directory.
     """
     rpath = self._get_request_path(spider, request)
     if not exists(rpath):
         os.makedirs(rpath)
     metadata = {
         'url': request.url,
         'method': request.method,
         'status': response.status,
         'response_url': response.url,
         'timestamp': time(),
     }
     with open(join(rpath, 'meta'), 'wb') as f:
         # Encode: the file is binary-mode, so writing a plain str raises
         # TypeError on Python 3.
         f.write(repr(metadata).encode('utf-8'))
     with open(join(rpath, 'pickled_meta'), 'wb') as f:
         pickle.dump(metadata, f, protocol=2)
     with open(join(rpath, 'response_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(response.headers))
     with open(join(rpath, 'response_body'), 'wb') as f:
         f.write(response.body)
     with open(join(rpath, 'request_headers'), 'wb') as f:
         f.write(headers_dict_to_raw(request.headers))
     with open(join(rpath, 'request_body'), 'wb') as f:
         f.write(request.body)
예제 #21
0
 def store_response(self, spider, request, response):
     """Store the given response in the cache.

     Writes the metadata (plain repr and pickled) plus raw headers and
     bodies as separate files under the request's cache directory.
     """
     rpath = self._get_request_path(spider, request)
     if not exists(rpath):
         os.makedirs(rpath)
     metadata = {
         "url": request.url,
         "method": request.method,
         "status": response.status,
         "response_url": response.url,
         "timestamp": time(),
     }
     with open(join(rpath, "meta"), "wb") as f:
         # Encode: the file is binary-mode, so writing a plain str raises
         # TypeError on Python 3.
         f.write(repr(metadata).encode("utf-8"))
     with open(join(rpath, "pickled_meta"), "wb") as f:
         pickle.dump(metadata, f, protocol=2)
     with open(join(rpath, "response_headers"), "wb") as f:
         f.write(headers_dict_to_raw(response.headers))
     with open(join(rpath, "response_body"), "wb") as f:
         f.write(response.body)
     with open(join(rpath, "request_headers"), "wb") as f:
         f.write(headers_dict_to_raw(request.headers))
     with open(join(rpath, "request_body"), "wb") as f:
         f.write(request.body)
예제 #22
0
파일: test_http.py 프로젝트: scrapy/w3lib
    def test_headers_dict_to_raw_listtuple(self):
        """Multi-valued (list/tuple) headers expand to repeated lines."""
        cases = [
            (OrderedDict([(b"Content-type", [b"text/html"]),
                          (b"Accept", [b"gzip"])]),
             b"Content-type: text/html\r\nAccept: gzip"),
            (OrderedDict([(b"Content-type", (b"text/html", )),
                          (b"Accept", (b"gzip", ))]),
             b"Content-type: text/html\r\nAccept: gzip"),
            (OrderedDict([(b"Cookie", (b"val001", b"val002")),
                          (b"Accept", b"gzip")]),
             b"Cookie: val001\r\nCookie: val002\r\nAccept: gzip"),
            (OrderedDict([(b"Cookie", [b"val001", b"val002"]),
                          (b"Accept", b"gzip")]),
             b"Cookie: val001\r\nCookie: val002\r\nAccept: gzip"),
        ]
        for headers, expected in cases:
            self.assertEqual(headers_dict_to_raw(headers), expected)
예제 #23
0
파일: test_http.py 프로젝트: scrapy/w3lib
    def test_headers_dict_to_raw_listtuple(self):
        """Each element of a list/tuple value becomes its own header line."""
        fixtures = (
            (OrderedDict([(b'Content-type', [b'text/html']),
                          (b'Accept', [b'gzip'])]),
             b'Content-type: text/html\r\nAccept: gzip'),
            (OrderedDict([(b'Content-type', (b'text/html',)),
                          (b'Accept', (b'gzip',))]),
             b'Content-type: text/html\r\nAccept: gzip'),
            (OrderedDict([(b'Cookie', (b'val001', b'val002')),
                          (b'Accept', b'gzip')]),
             b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'),
            (OrderedDict([(b'Cookie', [b'val001', b'val002']),
                          (b'Accept', b'gzip')]),
             b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'),
        )
        for dct, raw in fixtures:
            self.assertEqual(headers_dict_to_raw(dct), raw)
예제 #24
0
 def to_string(self):
     """Serialize this header mapping to a raw HTTP headers string."""
     raw_headers = headers_dict_to_raw(self)
     return raw_headers
예제 #25
0
파일: headers.py 프로젝트: 01-/scrapy
 def to_string(self):
     """Render these headers as a raw HTTP header block."""
     raw = headers_dict_to_raw(self)
     return raw
예제 #26
0
파일: test_http.py 프로젝트: scrapy/w3lib
 def test_headers_dict_to_raw(self):
     """bytes->bytes headers join as 'Name: value' lines separated by CRLF."""
     headers = OrderedDict([
         (b"Content-type", b"text/html"),
         (b"Accept", b"gzip"),
     ])
     self.assertEqual(
         headers_dict_to_raw(headers),
         b"Content-type: text/html\r\nAccept: gzip",
     )
예제 #27
0
파일: test_http.py 프로젝트: scrapy/w3lib
 def test_headers_raw_dict_none(self):
     """Both conversion helpers pass None through unchanged."""
     for converter in (headers_raw_to_dict, headers_dict_to_raw):
         self.assertIsNone(converter(None))
예제 #28
0
 def test_headers_dict_to_raw(self):
     """A simple bytes mapping serializes to CRLF-separated header lines."""
     headers = OrderedDict([
         (b'Content-type', b'text/html'),
         (b'Accept', b'gzip'),
     ])
     self.assertEqual(
         headers_dict_to_raw(headers),
         b'Content-type: text/html\r\nAccept: gzip',
     )
예제 #29
0
파일: test_http.py 프로젝트: scrapy/w3lib
 def test_headers_raw_dict_none(self):
     """None maps to None in both conversion directions."""
     for convert in (headers_raw_to_dict, headers_dict_to_raw):
         self.assertIsNone(convert(None))