def store_response(self, spider, request, response):
    """Store the given response in the mongo collection.

    The document is keyed by the request fingerprint and bundles the
    request/response metadata, headers and bodies into a single record.
    """
    key = request_fingerprint(request)
    stored_data = {
        'metadata': {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        },
        'response_headers': headers_dict_to_raw(response.headers),
        'response_body': self._get_body(response.headers, response.body),
        'request_headers': headers_dict_to_raw(request.headers),
        'request_body': self._get_body(request.headers, request.body),
    }
    try:
        self.collection.insert({"_id": key, "value": stored_data})
    except Exception as e:
        # Best-effort cache write: a failed insert (e.g. duplicate _id)
        # must not abort the crawl, so only report it.
        print(e)
def store_response(self, spider: TSpider, request: TRequest, response: TResponse) -> None:
    """Store the given response in the cache."""
    rpath = self._get_request_path(spider, request)
    if not os.path.exists(rpath):
        os.makedirs(rpath)
    metadata = {
        "url": request.url,
        "method": request.method,
        "status": response.status,
        "response_url": response.url,
        "timestamp": time(),
    }

    def write_file(name: str, payload: bytes) -> None:
        # One artifact per file inside the per-request cache directory.
        with self._open(os.path.join(rpath, name), "wb") as f:
            f.write(payload)

    write_file("meta", to_bytes(repr(metadata)))
    # pickled_meta is serialized via pickle.dump rather than a raw write.
    with self._open(os.path.join(rpath, "pickled_meta"), "wb") as f:
        pickle.dump(metadata, f, protocol=2)
    write_file("response_headers", headers_dict_to_raw(response.headers))
    write_file("response_body", response.body)
    write_file("request_headers", headers_dict_to_raw(request.headers))
    write_file("request_body", request.body)
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    rpath = self._get_request_path(spider, request)
    if not os.path.exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }

    def entry(name):
        # Resolve the on-disk path of one cache artifact.
        return os.path.join(rpath, name)

    with self._open(entry('meta'), 'wb') as fp:
        fp.write(to_bytes(repr(metadata)))
    with self._open(entry('pickled_meta'), 'wb') as fp:
        pickle.dump(metadata, fp, protocol=4)
    with self._open(entry('response_headers'), 'wb') as fp:
        fp.write(headers_dict_to_raw(response.headers))
    with self._open(entry('response_body'), 'wb') as fp:
        fp.write(response.body)
    with self._open(entry('request_headers'), 'wb') as fp:
        fp.write(headers_dict_to_raw(request.headers))
    with self._open(entry('request_body'), 'wb') as fp:
        fp.write(request.body)
def store_response(rpath, request, response):
    """Store the given response in the cache directory *rpath*.

    Creates the directory if needed and writes one file per artifact:
    meta, pickled_meta, request/response headers and bodies.
    """
    if not exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }
    with open(join(rpath, 'meta'), 'wb') as f:
        # Encode explicitly: the file is opened in binary mode, so
        # writing the str repr directly raises TypeError on Python 3.
        f.write(repr(metadata).encode('utf8'))
    with open(join(rpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)
    with open(join(rpath, 'response_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(response.headers))
    with open(join(rpath, 'response_body'), 'wb') as f:
        f.write(response.body)
    with open(join(rpath, 'request_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(request.headers))
    with open(join(rpath, 'request_body'), 'wb') as f:
        f.write(request.body)
def store_response(self, spider, request, response):
    """Store the given response in mongo, sharded over 1000 collections.

    The target collection is chosen from the request fingerprint modulo
    1000, spreading cache entries across collection0..collection999.
    """
    key = request_fingerprint(request)
    stored_data = {
        'metadata': {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        },
        'response_headers': headers_dict_to_raw(response.headers),
        'response_body': self._get_body(response.headers, response.body),
        'request_headers': headers_dict_to_raw(request.headers),
        'request_body': self._get_body(request.headers, request.body),
    }
    # Shard by fingerprint so no single collection grows unbounded.
    collection_index = int(key, 16) % 1000
    collection_name = 'collection' + str(collection_index)
    collection = self.db[collection_name]
    try:
        collection.insert({"_id": key, "value": stored_data})
        print("-----------------Write cache %s------------------" % collection_name)
    except Exception as e:
        # Best-effort write: duplicate keys or transient errors must not
        # abort the crawl; just report them.
        print(e)
def test_headers_dict_to_raw_wrong_values(self):
    """Non-bytes header values are skipped during serialization."""
    # A lone invalid (non-bytes, non-list) value yields no output at all.
    self.assertEqual(
        headers_dict_to_raw(OrderedDict([(b'Content-type', 0)])),
        b'',
    )
    # Invalid entries are dropped while valid ones are kept.
    self.assertEqual(
        headers_dict_to_raw(
            OrderedDict([(b'Content-type', 1), (b'Accept', [b'gzip'])])
        ),
        b'Accept: gzip',
    )
def test_headers_dict_to_raw_wrong_values(self):
    """Non-bytes header values are skipped by headers_dict_to_raw."""
    dct: HeadersDictInput = OrderedDict([
        (b"Content-type", 0),
    ])
    # A single invalid value produces an empty raw header block.
    # (The original repeated this assertion verbatim; once is enough.)
    self.assertEqual(headers_dict_to_raw(dct), b"")
    # Invalid values are dropped; valid list values are kept.
    dct = OrderedDict([(b"Content-type", 1), (b"Accept", [b"gzip"])])
    self.assertEqual(headers_dict_to_raw(dct), b"Accept: gzip")
def test_headers_dict_to_raw_wrong_values(self):
    """Non-bytes values are skipped when serializing headers."""
    self.assertEqual(headers_dict_to_raw(OrderedDict([(b'Content-type', 0)])), b'')
    self.assertEqual(
        headers_dict_to_raw(OrderedDict([(b'Content-type', 1), (b'Accept', [b'gzip'])])),
        b'Accept: gzip',
    )
def store_response(self, spider, request, response):
    """Store the given response in the cache.

    Serializes the request/response pair to JSON and adds it to the
    bucket keyed by the inverted request URL.
    """
    # Compute the document id once; it doubles as the bucket key.
    # (The original called self._inverse_url twice for the same URL.)
    _id = self._inverse_url(request.url)
    data = {
        '_id': _id,
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
        'response_body': response.body_as_unicode(),
        'response_headers': headers_dict_to_raw(response.headers),
        'request_headers': headers_dict_to_raw(request.headers),
        'request_body': request.body,
        'encoding': response.encoding
    }
    self.bucket.add(_id, 0, 0, json.dumps(data))
def _update_cache(self, spider, request, response):
    """Update the existing http_cache_binary row for this request.

    On database error the transaction is rolled back, the failure is
    logged, and a notice is appended to ``spider.errors``.
    """
    key = self._get_request_key(spider, request)
    request_meta = {
        'headers': headers_dict_to_raw(request.headers),
        'method': request.method
    }
    response_meta = {
        'headers': headers_dict_to_raw(response.headers),
        'status': str(response.status)  # make all values of str type
    }
    try:
        query = """
            UPDATE http_cache_binary
            SET ts=%(ts)s,
                request_url=%(request_url)s,
                request_meta=%(request_meta)s,
                request_body=%(request_body)s,
                response_url=%(response_url)s,
                response_meta=%(response_meta)s,
                response_body=%(response_body)s
            WHERE hashkey=%(hashkey)s
        """
        data = {
            'hashkey': key,
            'ts': datetime.datetime.now(),
            'request_url': request.url,
            'request_meta': request_meta,
            'request_body': psycopg2.Binary(request.body),
            'response_url': response.url,
            'response_meta': response_meta,
            'response_body': psycopg2.Binary(response.body)
        }
        self.cursor.execute(query, data)
    # NOTE: the original used Python-2-only `except DatabaseError, e`,
    # which is a SyntaxError on Python 3.
    except DatabaseError as e:
        self.db.rollback()
        err = "[HTTP Cache] Error: failed to update cache in database: %s" % str(e)
        logging.error(err)
        if not hasattr(spider, 'errors'):
            spider.errors = []
        spider.errors.append(
            'HTTP Cache failed. Please contact Yuri <*****@*****.**>'
        )
def test_headers_dict_to_raw(self):
    """Bytes values serialize to 'Name: value' lines joined by CRLF."""
    headers = OrderedDict()
    headers[b'Content-type'] = b'text/html'
    headers[b'Accept'] = b'gzip'
    self.assertEqual(
        headers_dict_to_raw(headers),
        b'Content-type: text/html\r\nAccept: gzip',
    )
def _save_cache(self, spider, request, response):
    """Insert a new http_cache_binary row for this request.

    Duplicate-key failures are silently ignored; other database errors
    roll back, are logged, and append a notice to ``spider.errors``.
    """
    key = self._get_request_key(spider, request)
    request_meta = {
        'headers': headers_dict_to_raw(request.headers),
        'method': request.method
    }
    response_meta = {
        'headers': headers_dict_to_raw(response.headers),
        'status': str(response.status)  # make all values of str type
    }
    try:
        query = """
            INSERT INTO http_cache_binary
                (hashkey, ts, request_url, request_meta, request_body,
                 response_url, response_meta, response_body)
            VALUES
                (%(hashkey)s, %(ts)s, %(request_url)s, %(request_meta)s,
                 %(request_body)s, %(response_url)s, %(response_meta)s,
                 %(response_body)s)
        """
        data = {
            'hashkey': key,
            'ts': datetime.datetime.now(),
            'request_url': request.url,
            'request_meta': request_meta,
            'request_body': psycopg2.Binary(request.body),
            'response_url': response.url,
            'response_meta': response_meta,
            'response_body': psycopg2.Binary(response.body)
        }
        self.cursor.execute(query, data)
    # NOTE: the original used Python-2-only `except DatabaseError, e`,
    # which is a SyntaxError on Python 3.
    except DatabaseError as e:
        self.db.rollback()
        err = "[HTTP Cache] Error: failed to save cache to database: %s" % str(e)
        if 'duplicate' not in err.lower():  # ignore 'duplicate' key errors
            logging.error(err)
            if not hasattr(spider, 'errors'):
                spider.errors = []
            spider.errors.append(
                'HTTP Cache failed. Please contact Yuri <*****@*****.**>'
            )
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    meta_data = {
        "url": request.url,
        "method": request.method,
        "status": response.status,
        "response_url": response.url,
        "timestamp": time(),
    }
    # One client key per artifact type, all sharing the same expiry.
    payloads = {
        "meta_data": json.dumps(meta_data),
        "response_headers": headers_dict_to_raw(response.headers),
        "response_body": response.body,
        "request_headers": headers_dict_to_raw(request.headers),
        "request_body": request.body,
    }
    for type_, value in payloads.items():
        key = self.key_for(spider, request, type_)
        self.client.set(key, value, self.expiration_secs)
def test_headers_dict_to_raw_listtuple(self):
    """List/tuple values expand to one header line per element."""
    single = b'Content-type: text/html\r\nAccept: gzip'
    multi = b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'
    cases = [
        (OrderedDict([(b'Content-type', [b'text/html']), (b'Accept', [b'gzip'])]), single),
        (OrderedDict([(b'Content-type', (b'text/html', )), (b'Accept', (b'gzip', ))]), single),
        (OrderedDict([(b'Cookie', (b'val001', b'val002')), (b'Accept', b'gzip')]), multi),
        (OrderedDict([(b'Cookie', [b'val001', b'val002']), (b'Accept', b'gzip')]), multi),
    ]
    for dct, expected in cases:
        self.assertEqual(headers_dict_to_raw(dct), expected)
def store_response(self, spider, request, response):
    """Store the given response in the cache."""
    # TODO: Use a buffer instead of sending the cache files one by one
    keyname = self._get_request_path(request)
    entry = {
        'meta': {
            'url': request.url,
            'method': request.method,
            'status': response.status,
            'response_url': response.url,
            'timestamp': time(),
        },
        'response_headers': headers_dict_to_raw(response.headers),
        'response_body': response.body,
        'request_headers': headers_dict_to_raw(request.headers),
        'request_body': request.body,
    }
    # One pickled blob per request keeps the remote store simple.
    self.put_object_to_key(pickle.dumps(entry), self.bucket_name, keyname)
def store_response(self, spider, request, response):
    """Upload cached request/response artifacts to S3.

    Redirect (302) responses are deliberately not cached.
    """
    if response.status == 302:
        return
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }

    def upload(key, body):
        # Resolve the storage key for this request and push one artifact.
        send_s3_text(self.bucket, storage_path(request, key), body)

    upload('meta', repr(metadata).encode('utf8'))
    upload('pickled_meta', pickle.dumps(metadata, protocol=2))
    upload('request_headers', headers_dict_to_raw(request.headers))
    upload('request_body', request.body)
    upload('response_headers', headers_dict_to_raw(response.headers))
    upload('response_body', response.body)
def setUp(self):
    """Build a spider, a request/response pair and canned stored payloads."""
    self.crawler = get_crawler(Spider)
    self.spider = self.crawler._create_spider(self.spider_name)
    self.tmpdir = tempfile.mkdtemp()
    self.request = Request('http://www.example.com', headers={'User-Agent': 'test'})
    self.response = Response(
        'http://www.example.com',
        headers={'Content-Type': 'text/html'},
        body=b'test body',
        status=202,
    )
    self.crawler.stats.open_spider(self.spider)
    meta = {
        'url': self.request.url,
        'method': self.request.method,
        'status': self.response.status,
        'response_url': self.response.url,
        'timestamp': time.time(),
    }
    # The cached entry mirrors what the storage backend writes.
    self.cached_response = {
        'meta': meta,
        'response_headers': headers_dict_to_raw(self.response.headers),
        'response_body': self.response.body,
        'request_headers': headers_dict_to_raw(self.request.headers),
        'request_body': self.request.body,
    }
    self.pickled_cached_response = pickle.dumps(self.cached_response)
    # Plain and gzip-compressed variants of the stored object body.
    self.get_object_response = {
        'Body': StreamingBody(
            io.BytesIO(self.pickled_cached_response),
            len(self.pickled_cached_response),
        )
    }
    self.gzipped_pickled_cached_response = gzip.compress(self.pickled_cached_response)
    self.get_object_response_gziped = {
        'Body': StreamingBody(
            io.BytesIO(self.gzipped_pickled_cached_response),
            len(self.gzipped_pickled_cached_response),
        )
    }
def _save_cache(self, spider, request, response):
    """Persist the request/response pair into ssdb with an expiry."""
    key = self._get_request_key(spider, request)
    record = {
        'hashkey': key,
        'ts': time(),
        'request_url': request.url,
        'request_meta': {
            'headers': headers_dict_to_raw(request.headers),
            'method': request.method,
        },
        'request_body': bytearray(request.body),
        'response_url': response.url,
        'response_meta': {
            'headers': headers_dict_to_raw(response.headers),
            # str() so all values in the dict are of str type
            'status': str(response.status),
        },
        'response_body': bytearray(response.body),
    }
    self.ssdb.setx(key, record, self.expiration_secs)
def store_response(self, spider, request, response):
    """Store the given response in the cache.

    Writes one file per artifact (meta, pickled_meta, headers, bodies)
    under the per-request cache directory.
    """
    rpath = self._get_request_path(spider, request)
    if not exists(rpath):
        os.makedirs(rpath)
    metadata = {
        'url': request.url,
        'method': request.method,
        'status': response.status,
        'response_url': response.url,
        'timestamp': time(),
    }
    with open(join(rpath, 'meta'), 'wb') as f:
        # Encode explicitly: the file is opened in binary mode, so
        # writing the str repr directly raises TypeError on Python 3.
        f.write(repr(metadata).encode('utf8'))
    with open(join(rpath, 'pickled_meta'), 'wb') as f:
        pickle.dump(metadata, f, protocol=2)
    with open(join(rpath, 'response_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(response.headers))
    with open(join(rpath, 'response_body'), 'wb') as f:
        f.write(response.body)
    with open(join(rpath, 'request_headers'), 'wb') as f:
        f.write(headers_dict_to_raw(request.headers))
    with open(join(rpath, 'request_body'), 'wb') as f:
        f.write(request.body)
def store_response(self, spider, request, response):
    """Store the given response in the cache.

    Writes one file per artifact (meta, pickled_meta, headers, bodies)
    under the per-request cache directory.
    """
    rpath = self._get_request_path(spider, request)
    if not exists(rpath):
        os.makedirs(rpath)
    metadata = {
        "url": request.url,
        "method": request.method,
        "status": response.status,
        "response_url": response.url,
        "timestamp": time(),
    }
    with open(join(rpath, "meta"), "wb") as f:
        # Encode explicitly: the file is opened in binary mode, so
        # writing the str repr directly raises TypeError on Python 3.
        f.write(repr(metadata).encode("utf8"))
    with open(join(rpath, "pickled_meta"), "wb") as f:
        pickle.dump(metadata, f, protocol=2)
    with open(join(rpath, "response_headers"), "wb") as f:
        f.write(headers_dict_to_raw(response.headers))
    with open(join(rpath, "response_body"), "wb") as f:
        f.write(response.body)
    with open(join(rpath, "request_headers"), "wb") as f:
        f.write(headers_dict_to_raw(request.headers))
    with open(join(rpath, "request_body"), "wb") as f:
        f.write(request.body)
def test_headers_dict_to_raw_listtuple(self):
    """List/tuple values expand into one header line per element."""
    single = b"Content-type: text/html\r\nAccept: gzip"
    multi = b"Cookie: val001\r\nCookie: val002\r\nAccept: gzip"
    cases = [
        (OrderedDict([(b"Content-type", [b"text/html"]), (b"Accept", [b"gzip"])]), single),
        (OrderedDict([(b"Content-type", (b"text/html", )), (b"Accept", (b"gzip", ))]), single),
        (OrderedDict([(b"Cookie", (b"val001", b"val002")), (b"Accept", b"gzip")]), multi),
        (OrderedDict([(b"Cookie", [b"val001", b"val002"]), (b"Accept", b"gzip")]), multi),
    ]
    for dct, expected in cases:
        self.assertEqual(headers_dict_to_raw(dct), expected)
def test_headers_dict_to_raw_listtuple(self):
    """List and tuple header values each become their own header line."""
    single = b'Content-type: text/html\r\nAccept: gzip'
    multi = b'Cookie: val001\r\nCookie: val002\r\nAccept: gzip'
    self.assertEqual(
        headers_dict_to_raw(
            OrderedDict([(b'Content-type', [b'text/html']), (b'Accept', [b'gzip'])])),
        single)
    self.assertEqual(
        headers_dict_to_raw(
            OrderedDict([(b'Content-type', (b'text/html',)), (b'Accept', (b'gzip',))])),
        single)
    self.assertEqual(
        headers_dict_to_raw(
            OrderedDict([(b'Cookie', (b'val001', b'val002')), (b'Accept', b'gzip')])),
        multi)
    self.assertEqual(
        headers_dict_to_raw(
            OrderedDict([(b'Cookie', [b'val001', b'val002']), (b'Accept', b'gzip')])),
        multi)
def to_string(self):
    """Serialize these headers into raw format ('Name: value' lines)."""
    return headers_dict_to_raw(self)
def test_headers_dict_to_raw(self):
    """Two simple bytes headers serialize to CRLF-separated lines."""
    dct = OrderedDict()
    dct[b"Content-type"] = b"text/html"
    dct[b"Accept"] = b"gzip"
    expected = b"Content-type: text/html\r\nAccept: gzip"
    self.assertEqual(headers_dict_to_raw(dct), expected)
def test_headers_raw_dict_none(self):
    """Both conversion helpers map a None input to None."""
    for convert in (headers_raw_to_dict, headers_dict_to_raw):
        self.assertIsNone(convert(None))
def test_headers_dict_to_raw(self):
    """Plain bytes header values join as 'Name: value' CRLF lines."""
    raw = headers_dict_to_raw(
        OrderedDict([(b'Content-type', b'text/html'), (b'Accept', b'gzip')])
    )
    self.assertEqual(raw, b'Content-type: text/html\r\nAccept: gzip')