async def test_mutliple_values_for_sample_url(self):
    """Expect a soft-404 verdict when the engine is given a 404 and a 200 sample.

    The 200 sample carries the same content ("response") as the entry
    under test, and the rule's confirmation factor is set to 5.
    """
    not_found = Response(404, {})
    not_found.set_content(b"a", True)
    same_content = Response(200, {})
    same_content.set_content(b"response", True)
    # NOTE(review): the stub engine presumably serves these in order -- confirm.
    self.engine.response = [not_found, same_content]

    entry = self.create_entry("http://example.com/test", response_content="response")
    self.rule.confirmation_factor = 5

    await self.runner.perform_ok(entry)

    self.assertTrue(entry.result.soft404)
def test_content_use_encoding_in_mimetype_if_present(self):
    """``content`` must equal the payload decoded with the header's charset."""
    charset = "iso-8859-1"
    encoded = "abcdé".encode(charset)
    headers = {"content-type": "text/html; charset=iso-8859-1"}

    response = Response(200, headers)
    response.set_content(encoded, True)

    expected = encoded.decode(charset)
    self.assertEqual(response.content, expected)
def test_content_find_content_encoding_if_utf8_fails(self):
    """``content`` must equal the iso-8859-1 decoding when no headers are given.

    The only charset hint available to the response is the ``<meta>`` tag
    embedded in the payload itself.
    """
    document = (
        '<html><head>'
        '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">'
        '</head><body>abcdé</body></html>'
    )
    encoded = document.encode("iso-8859-1")

    response = Response(200, {})
    response.set_content(encoded, True)

    expected = encoded.decode("iso-8859-1")
    self.assertEqual(response.content, expected)
def test_partial_content_raise_unicode_decode_error_if_error_not_caused_by_truncated_content(
        self):
    """Reading ``partial_content`` of a non-UTF-8 payload must raise."""
    # Ten 0x80 bytes: this is not an encoded utf-8 string.
    not_utf8 = b"\x80" * 10
    response = Response(200, {})
    response.set_content(not_utf8, False)

    with self.assertRaises(UnicodeDecodeError):
        # Plain attribute access is what triggers the decoding.
        response.partial_content
def test_partial_content_ignore_truncated_utf8_characters_when_decoding(self):
    """``partial_content`` must drop a trailing, incomplete UTF-8 character."""
    encoded = "abcdé".encode("utf-8")
    # Cut the last byte off so the second byte of 'é' is missing.
    truncated = encoded[:-1]

    response = Response(200, {})
    response.set_content(truncated, False)

    self.assertEqual(response.partial_content, "abcd")
async def test_add_hash_of_raw_content_if_response_content_of_sample_is_not_text(self):
    """Expect the MD5 of the raw sample bytes to be stored under ``"md5"``.

    The mocked engine returns a sample response whose payload is not valid
    UTF-8; after the filter processes an entry for ``example.com``, the
    knowledge base entry for ``"example.com/"`` must hold that digest.
    """
    # Named ``raw`` rather than ``bytes`` so the builtin type is not shadowed.
    raw = b'Invalid UTF8 x\x80Z"'
    sample_response = Response(200, {})
    sample_response.set_content(raw, True)
    self.engine.mock.perform_high_priority.return_value = sample_response

    entry = Entry.create("http://example.com/?wsdl", response=StaticResponse(200, {}, "123"))
    await self.filter.after_response(entry)

    expected = {"md5": hashlib.md5(raw).digest()}
    self.assertEqual(self.kb.query_samples["example.com/"], expected)
def test_dont_match_if_simhash_in_knowledge_base_but_response_content_is_not_text(
        self):
    """``_match`` must be falsy for a binary response against a simhash signature."""
    binary_payload = b'x\x80Z"\x1a\x98\x8ey\xef?B\xd7\xc5\xbf\xd4\x18'
    response = Response(200, {})
    response.set_content(binary_payload, True)
    known_signature = {"code": 200, "content_simhash": 12345}

    matched = self.rule._match(response, known_signature)

    self.assertFalse(matched)
def test_content_dont_decode_twice_if_utf8_is_invalid(self):
    """A failing utf-8 decode must be attempted exactly once and then propagate.

    The payload is a mock whose ``decode`` always raises, and the header
    already declares utf-8 as the charset.
    """
    decode_failure = UnicodeDecodeError("utf-8", b"abc", 1, 2, "reason")
    payload = MagicMock()
    payload.decode.side_effect = decode_failure

    response = Response(200, {"content-type": "text/html; charset=utf-8"})
    response.set_content(payload, at_eof=True)

    with self.assertRaises(UnicodeDecodeError):
        # Plain attribute access is what triggers the decoding.
        response.content

    payload.decode.assert_called_once_with("utf-8")
async def test_content_that_is_not_text_never_match_content_simhash_of_sample(self):
    """``Simhash`` must not be called for a response that is not valid UTF-8.

    The knowledge base is seeded with a simhash sample for ``"example.com/"``
    and ``Simhash`` is patched in the filter's module so any use is detected.
    """
    raw = b'Invalid UTF8 x\x80Z"'
    response = Response(200, {})
    response.set_content(raw, True)

    # Named ``sample_simhash`` rather than ``hash`` so the builtin is not shadowed.
    sample_simhash = self.filter._hash_response(StaticResponse(200, {}, "content"))
    self.kb.query_samples["example.com/"] = {"simhash": sample_simhash}

    with patch("tachyon.heuristics.rejectignoredquery.Simhash") as Simhash:
        entry = Entry.create("http://example.com/?wsdl", response=response)
        await self.filter.after_response(entry)

        Simhash.assert_not_called()
async def test_compare_hash_of_raw_content_if_raw_content_hash_in_knowledge_base(
        self):
    """Expect a soft-404 verdict when the raw-content MD5 matches a stored signature.

    The knowledge base is seeded with a ``ContentSignature`` whose
    ``content_hash`` is the MD5 of the same non-UTF-8 payload the entry's
    response carries.
    """
    raw = b'Invalid UTF8 x\x80Z"'
    _hash = hashlib.md5(raw).digest()
    # Backslash escaped explicitly: "\l" is not a valid escape sequence and
    # triggers a warning on recent Python versions. The value is unchanged.
    key = "/\\l"
    self.kb.soft_404_responses["http://example.com/"][key] = ContentSignature(
        code=200, content_hash=_hash)
    self.rule.performed["http://example.com/"] = {key: None}

    response = Response(200, {})
    response.set_content(raw, True)
    entry = Entry.create("http://example.com/test", response=response)

    await self.runner.perform_ok(entry)

    self.assertTrue(entry.result.soft404)
async def test_compare_hash_of_raw_content_if_no_simhash_in_knowledge_base(
        self):
    """Expect ``RejectRequest`` when the raw-content MD5 matches the stored hash.

    The stored signature holds only ``code`` and ``raw_content_hash`` (no
    simhash), and the response carries the same binary payload.
    """
    raw = b'x\x80Z"\x1a\x98\x8ey\xef?B\xd7\xc5\xbf\xd4\x18'
    _hash = hashlib.md5(raw).digest()
    # Backslash escaped explicitly: "\l" is not a valid escape sequence and
    # triggers a warning on recent Python versions. The value is unchanged.
    key = "/\\l"
    self.kb.soft_404_responses["http://example.com/"][key] = {
        "code": 200,
        "raw_content_hash": _hash,
    }
    self.rule.performed["http://example.com/"] = {key: None}

    response = Response(200, {})
    response.set_content(raw, True)

    with self.assertRaises(RejectRequest):
        await self.rule.after_response(
            Entry.create("http://example.com/test", response=response))
async def test_add_hash_of_raw_content_if_response_content_of_sample_is_not_text(
        self):
    """Expect a ``ContentSignature`` holding the MD5 of the raw sample bytes.

    The engine is given a sample response that is not valid UTF-8; after the
    runner processes an entry, the knowledge base for
    ``"http://example.com/"`` must hold one signature with code 200 and that
    digest as ``content_hash`` (``content_sample`` is not checked).
    """
    # Named ``raw`` rather than ``bytes`` so the builtin type is not shadowed.
    raw = b'Invalid UTF8 x\x80Z"'
    sample_response = Response(200, {})
    sample_response.set_content(raw, True)
    self.engine.response = sample_response

    await self.runner.perform_ok(
        self.create_entry("http://example.com/test", response_content="response"))

    expected_signature = ContentSignature(
        code=200, content_hash=hashlib.md5(raw).digest(), content_sample=ANY)
    # Backslash escaped explicitly: "\l" is not a valid escape sequence and
    # triggers a warning on recent Python versions. The value is unchanged.
    expected = {"/\\l": [expected_signature]}
    self.assertEqual(self.kb.soft_404_responses["http://example.com/"], expected)
async def test_add_hash_of_raw_content_if_response_content_is_not_text(
        self):
    """Expect ``raw_content_hash`` to hold the MD5 of the binary engine response.

    The engine is given a response that is not valid UTF-8; after the rule
    processes an entry, the knowledge base for ``"http://example.com/"``
    must map the key to ``code`` 200 and the MD5 digest of those bytes.
    """
    # One literal for both the payload and the expected digest.
    raw = b'x\x80Z"\x1a\x98\x8ey\xef?B\xd7\xc5\xbf\xd4\x18'
    response = Response(200, {})
    response.set_content(raw, True)
    self.engine.response = response

    await self.rule.after_response(
        self.create_entry("http://example.com/test", response_content="response"))

    # Backslash escaped explicitly: "\l" is not a valid escape sequence and
    # triggers a warning on recent Python versions. The value is unchanged.
    expected = {
        "/\\l": {
            "code": 200,
            "raw_content_hash": hashlib.md5(raw).digest(),
        }
    }
    self.assertEqual(self.kb.soft_404_responses["http://example.com/"], expected)
def test_partial_content_return_decoded_truncated_raw_bytes(self):
    """``partial_content`` must return the decoded text of a not-at-EOF payload."""
    payload = b"abcdefg"
    response = Response(200, {})
    # The second argument is False here; another test in this file passes it as ``at_eof``.
    response.set_content(payload, False)

    decoded = response.partial_content

    self.assertEqual(decoded, "abcdefg")