def setUp(self):
    """Create the ``HttpCompression`` middleware used by each test."""
    engine = get_engine()
    self.mw = HttpCompression(engine)
class HttpCompressionTest(unittest.TestCase):
    """Tests for the ``HttpCompression`` middleware.

    Covers the ``Accept-Encoding`` header added to requests and the
    handling of gzip/deflate encoded response bodies.
    """

    def setUp(self):
        self.mw = HttpCompression(get_engine())

    def _getresponse(self, coding):
        """Build a compressed ``Response`` from the sample file for *coding*.

        *coding* must be a key of ``FORMAT``, which maps it to a
        ``(sample_file, content_encoding)`` pair. The returned response
        carries a request that advertises ``gzip,deflate``.

        Raises ``ValueError`` when *coding* is not a key of ``FORMAT``.
        """
        if coding not in FORMAT:
            raise ValueError("unknown coding: %r" % (coding,))

        sample_file, content_encoding = FORMAT[coding]
        with open(join(SAMPLE_DIR, sample_file), "rb") as sample:
            body = sample.read()

        headers = {
            "Server": "Yaws/1.49 Yet Another Web Server",
            "Date": "Sun, 08 Mar 2009 00:41:03 GMT",
            "Content-Length": len(body),
            "Content-Type": "text/html",
            "Content-Encoding": content_encoding,
        }
        response = Response("http://github.com/", body=body, headers=headers)
        response.request = Request(
            "http://github.com/", headers={"Accept-Encoding": "gzip,deflate"})
        return response

    def _gzipped(self, data):
        """Return *data* gzip-compressed in an in-memory buffer."""
        buf = StringIO()
        # The ``with`` block closes the GzipFile even if ``write`` raises;
        # the buffer is only read after the gzip stream has been closed.
        with GzipFile(fileobj=buf, mode="wb") as zf:
            zf.write(data)
        return buf.getvalue()

    def _check_decoded(self, coding, content_encoding):
        """Assert that the *coding* sample is decoded into a new response.

        *content_encoding* is the ``Content-Encoding`` header value the
        sample response is expected to carry before processing.
        """
        response = self._getresponse(coding)
        self.assertEqual(response.headers["Content-Encoding"], content_encoding)
        new_response = self.mw.process_response(response)
        self.assertIsNot(new_response, response)
        self.assertTrue(new_response.body.startswith("<!DOCTYPE"))
        self.assertNotIn("Content-Encoding", new_response.headers)

    def test_process_request(self):
        request = Request("http://github.com/")
        self.assertNotIn("Accept-Encoding", request.headers)
        request = self.mw.process_request(request)
        self.assertEqual(request.headers.get("Accept-Encoding"), "gzip,deflate")

    def test_process_response_gzip(self):
        self._check_decoded("gzip", "gzip")

    def test_process_response_rawdeflate(self):
        self._check_decoded("rawdeflate", "deflate")

    def test_process_response_zlibdelate(self):
        self._check_decoded("zlibdeflate", "deflate")

    def test_process_response_plain(self):
        # A response without Content-Encoding must be passed through as-is.
        response = Response("http://crawlmitest.org", body="<!DOCTYPE...")
        self.assertFalse(response.headers.get("Content-Encoding"))
        new_response = self.mw.process_response(response)
        self.assertIs(new_response, response)
        self.assertTrue(new_response.body.startswith("<!DOCTYPE"))

    def test_multipleencodings(self):
        # Only the gzip encoding is expected to be removed from the header.
        response = self._getresponse("gzip")
        response.headers["Content-Encoding"] = ["uuencode", "gzip"]
        new_response = self.mw.process_response(response)
        self.assertIsNot(new_response, response)
        self.assertEqual(new_response.headers.getlist("Content-Encoding"), ["uuencode"])

    def test_process_response_encoding(self):
        headers = {"Content-Type": "text/html", "Content-Encoding": "gzip"}
        plain_body = "<html><head><title>Some page</title>"
        response = Response(
            "http://github.com/", headers=headers, body=self._gzipped(plain_body))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plain_body)
        self.assertEqual(new_response.encoding, normalize_encoding("utf-8"))

    def test_process_response_encoding_inside_body(self):
        headers = {"Content-Type": "text/html", "Content-Encoding": "gzip"}
        plainbody = """<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
        # NOTE(review): "http;//" looks like a typo for "http://" - confirm
        # before changing, the URL is kept exactly as it was.
        response = Response(
            "http;//www.example.com/", headers=headers, body=self._gzipped(plainbody))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding("gb2312"))

    def test_process_response_force_recalculate_encoding(self):
        headers = {"Content-Type": "text/html", "Content-Encoding": "gzip"}
        plainbody = """<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">"""
        # The input is already an HtmlResponse here; the encoding asserted
        # below is the one declared inside the decompressed body.
        # NOTE(review): "http;//" looks like a typo for "http://" - confirm.
        response = HtmlResponse(
            "http;//www.example.com/page.html",
            headers=headers,
            body=self._gzipped(plainbody))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding("gb2312"))

    def test_max_length(self):
        request = Request("http://github.com/", meta={"DOWNLOAD_SIZE_LIMIT": 74839})
        response = self._getresponse("rawdeflate")
        response.request = request
        self.assertRaises(DecompressSizeError, self.mw.process_response, response)
        # With the limit raised by one, processing must not raise.
        # NOTE(review): presumably 74840 is the decompressed size of the
        # rawdeflate sample - confirm against the fixture.
        response.meta["DOWNLOAD_SIZE_LIMIT"] = 74840
        self.mw.process_response(response)
class HttpCompressionTest(unittest.TestCase):
    """Tests for the ``HttpCompression`` middleware.

    Covers the ``Accept-Encoding`` header added to requests and the
    handling of gzip/deflate encoded response bodies.
    """

    def setUp(self):
        self.mw = HttpCompression(get_engine())

    def _getresponse(self, coding):
        """Build a compressed ``Response`` from the sample file for *coding*.

        *coding* must be a key of ``FORMAT``, which maps it to a
        ``(sample_file, content_encoding)`` pair. The returned response
        carries a request that advertises ``gzip,deflate``.

        Raises ``ValueError`` when *coding* is not a key of ``FORMAT``.
        """
        if coding not in FORMAT:
            raise ValueError('unknown coding: %r' % (coding,))

        sample_file, content_encoding = FORMAT[coding]
        with open(join(SAMPLE_DIR, sample_file), 'rb') as sample:
            body = sample.read()

        headers = {
            'Server': 'Yaws/1.49 Yet Another Web Server',
            'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
            'Content-Length': len(body),
            'Content-Type': 'text/html',
            'Content-Encoding': content_encoding,
        }
        response = Response('http://github.com/', body=body, headers=headers)
        response.request = Request(
            'http://github.com/', headers={'Accept-Encoding': 'gzip,deflate'})
        return response

    def _gzipped(self, data):
        """Return *data* gzip-compressed in an in-memory buffer."""
        buf = StringIO()
        # The ``with`` block closes the GzipFile even if ``write`` raises;
        # the buffer is only read after the gzip stream has been closed.
        with GzipFile(fileobj=buf, mode='wb') as zf:
            zf.write(data)
        return buf.getvalue()

    def _check_decoded(self, coding, content_encoding):
        """Assert that the *coding* sample is decoded into a new response.

        *content_encoding* is the ``Content-Encoding`` header value the
        sample response is expected to carry before processing.
        """
        response = self._getresponse(coding)
        self.assertEqual(response.headers['Content-Encoding'], content_encoding)
        new_response = self.mw.process_response(response)
        self.assertIsNot(new_response, response)
        self.assertTrue(new_response.body.startswith('<!DOCTYPE'))
        self.assertNotIn('Content-Encoding', new_response.headers)

    def test_process_request(self):
        request = Request('http://github.com/')
        self.assertNotIn('Accept-Encoding', request.headers)
        request = self.mw.process_request(request)
        self.assertEqual(request.headers.get('Accept-Encoding'), 'gzip,deflate')

    def test_process_response_gzip(self):
        self._check_decoded('gzip', 'gzip')

    def test_process_response_rawdeflate(self):
        self._check_decoded('rawdeflate', 'deflate')

    def test_process_response_zlibdelate(self):
        self._check_decoded('zlibdeflate', 'deflate')

    def test_process_response_plain(self):
        # A response without Content-Encoding must be passed through as-is.
        response = Response('http://crawlmitest.org', body='<!DOCTYPE...')
        self.assertFalse(response.headers.get('Content-Encoding'))
        new_response = self.mw.process_response(response)
        self.assertIs(new_response, response)
        self.assertTrue(new_response.body.startswith('<!DOCTYPE'))

    def test_multipleencodings(self):
        # Only the gzip encoding is expected to be removed from the header.
        response = self._getresponse('gzip')
        response.headers['Content-Encoding'] = ['uuencode', 'gzip']
        new_response = self.mw.process_response(response)
        self.assertIsNot(new_response, response)
        self.assertEqual(new_response.headers.getlist('Content-Encoding'), ['uuencode'])

    def test_process_response_encoding(self):
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plain_body = '<html><head><title>Some page</title>'
        response = Response(
            'http://github.com/', headers=headers, body=self._gzipped(plain_body))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plain_body)
        self.assertEqual(new_response.encoding, normalize_encoding('utf-8'))

    def test_process_response_encoding_inside_body(self):
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plainbody = '''<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">'''
        # NOTE(review): 'http;//' looks like a typo for 'http://' - confirm
        # before changing, the URL is kept exactly as it was.
        response = Response(
            'http;//www.example.com/', headers=headers, body=self._gzipped(plainbody))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding('gb2312'))

    def test_process_response_force_recalculate_encoding(self):
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plainbody = '''<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">'''
        # The input is already an HtmlResponse here; the encoding asserted
        # below is the one declared inside the decompressed body.
        # NOTE(review): 'http;//' looks like a typo for 'http://' - confirm.
        response = HtmlResponse(
            'http;//www.example.com/page.html',
            headers=headers,
            body=self._gzipped(plainbody))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding('gb2312'))

    def test_max_length(self):
        request = Request('http://github.com/', meta={'DOWNLOAD_SIZE_LIMIT': 74839})
        response = self._getresponse('rawdeflate')
        response.request = request
        self.assertRaises(DecompressSizeError, self.mw.process_response, response)
        # With the limit raised by one, processing must not raise.
        # NOTE(review): presumably 74840 is the decompressed size of the
        # rawdeflate sample - confirm against the fixture.
        response.meta['DOWNLOAD_SIZE_LIMIT'] = 74840
        self.mw.process_response(response)
def setUp(self):
    """Create the ``HttpCompression`` middleware used by each test."""
    engine = get_engine()
    self.mw = HttpCompression(engine)
class HttpCompressionTest(unittest.TestCase):
    """Tests for the ``HttpCompression`` middleware.

    Covers the ``Accept-Encoding`` header added to requests and the
    handling of gzip/deflate encoded response bodies.
    """

    def setUp(self):
        self.mw = HttpCompression(get_engine())

    def _getresponse(self, coding):
        """Build a compressed ``Response`` from the sample file for *coding*.

        *coding* must be a key of ``FORMAT``, which maps it to a
        ``(sample_file, content_encoding)`` pair. The returned response
        carries a request that advertises ``gzip,deflate``.

        Raises ``ValueError`` when *coding* is not a key of ``FORMAT``.
        """
        if coding not in FORMAT:
            raise ValueError('unknown coding: %r' % (coding,))

        sample_file, content_encoding = FORMAT[coding]
        with open(join(SAMPLE_DIR, sample_file), 'rb') as sample:
            body = sample.read()

        headers = {
            'Server': 'Yaws/1.49 Yet Another Web Server',
            'Date': 'Sun, 08 Mar 2009 00:41:03 GMT',
            'Content-Length': len(body),
            'Content-Type': 'text/html',
            'Content-Encoding': content_encoding,
        }
        response = Response('http://github.com/', body=body, headers=headers)
        response.request = Request(
            'http://github.com/', headers={'Accept-Encoding': 'gzip,deflate'})
        return response

    def _gzipped(self, data):
        """Return *data* gzip-compressed in an in-memory buffer."""
        buf = StringIO()
        # The ``with`` block closes the GzipFile even if ``write`` raises;
        # the buffer is only read after the gzip stream has been closed.
        with GzipFile(fileobj=buf, mode='wb') as zf:
            zf.write(data)
        return buf.getvalue()

    def _check_decoded(self, coding, content_encoding):
        """Assert that the *coding* sample is decoded into a new response.

        *content_encoding* is the ``Content-Encoding`` header value the
        sample response is expected to carry before processing.
        """
        response = self._getresponse(coding)
        self.assertEqual(response.headers['Content-Encoding'], content_encoding)
        new_response = self.mw.process_response(response)
        self.assertIsNot(new_response, response)
        self.assertTrue(new_response.body.startswith('<!DOCTYPE'))
        self.assertNotIn('Content-Encoding', new_response.headers)

    def test_process_request(self):
        request = Request('http://github.com/')
        self.assertNotIn('Accept-Encoding', request.headers)
        request = self.mw.process_request(request)
        self.assertEqual(
            request.headers.get('Accept-Encoding'), 'x-gzip,gzip,deflate')

    def test_process_response_gzip(self):
        self._check_decoded('gzip', 'gzip')

    def test_process_response_rawdeflate(self):
        self._check_decoded('rawdeflate', 'deflate')

    def test_process_response_zlibdelate(self):
        self._check_decoded('zlibdeflate', 'deflate')

    def test_process_response_plain(self):
        # A response without Content-Encoding must be passed through as-is.
        response = Response('http://crawlmitest.org', body='<!DOCTYPE...')
        self.assertFalse(response.headers.get('Content-Encoding'))
        new_response = self.mw.process_response(response)
        self.assertIs(new_response, response)
        self.assertTrue(new_response.body.startswith('<!DOCTYPE'))

    def test_multipleencodings(self):
        # Only the gzip encoding is expected to be removed from the header.
        response = self._getresponse('gzip')
        response.headers['Content-Encoding'] = ['uuencode', 'gzip']
        new_response = self.mw.process_response(response)
        self.assertIsNot(new_response, response)
        self.assertEqual(new_response.headers.getlist('Content-Encoding'), ['uuencode'])

    def test_process_response_encoding(self):
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plain_body = '<html><head><title>Some page</title>'
        response = Response(
            'http://github.com/', headers=headers, body=self._gzipped(plain_body))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plain_body)
        self.assertEqual(new_response.encoding, normalize_encoding('utf-8'))

    def test_process_response_encoding_inside_body(self):
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plainbody = '''<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">'''
        # NOTE(review): 'http;//' looks like a typo for 'http://' - confirm
        # before changing, the URL is kept exactly as it was.
        response = Response(
            'http;//www.example.com/', headers=headers, body=self._gzipped(plainbody))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding('gb2312'))

    def test_process_response_force_recalculate_encoding(self):
        headers = {
            'Content-Type': 'text/html',
            'Content-Encoding': 'gzip',
        }
        plainbody = '''<html><head><title>Some page</title><meta http-equiv="Content-Type" content="text/html; charset=gb2312">'''
        # The input is already an HtmlResponse here; the encoding asserted
        # below is the one declared inside the decompressed body.
        # NOTE(review): 'http;//' looks like a typo for 'http://' - confirm.
        response = HtmlResponse(
            'http;//www.example.com/page.html',
            headers=headers,
            body=self._gzipped(plainbody))
        new_response = self.mw.process_response(response)
        self.assertIsInstance(new_response, HtmlResponse)
        self.assertEqual(new_response.body, plainbody)
        self.assertEqual(new_response.encoding, normalize_encoding('gb2312'))

    def test_max_length(self):
        request = Request('http://github.com/', meta={'DOWNLOAD_SIZE_LIMIT': 74839})
        response = self._getresponse('rawdeflate')
        response.request = request
        self.assertRaises(DecompressSizeError, self.mw.process_response, response)
        # With the limit raised by one, processing must not raise.
        # NOTE(review): presumably 74840 is the decompressed size of the
        # rawdeflate sample - confirm against the fixture.
        response.meta['DOWNLOAD_SIZE_LIMIT'] = 74840
        self.mw.process_response(response)