def _process_hyperquack_v1(self, filename: str, scan: Any,
                           random_measurement_id: str) -> Iterator[Row]:
    """Flatten one HyperQuack V1 scan line into a row per result.

    Handles a line of Echo/Discard/HTTP/S data in the V1 format described at
    https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv1.rst

    Args:
      filename: a filepath string, used to derive each row's 'source'
      scan: a loaded json object containing the parsed content of the line
      random_measurement_id: a hex id identifying this individual measurement,
        copied unchanged into every row produced for the scan

    Yields:
      Rows, one for each entry of the scan's 'Results' list
    """
    for position, trial in enumerate(scan.get('Results', [])):
        started = trial['StartTime']
        date = started[:10]

        sent_domain = _extract_domain_from_sent_field(trial['Sent'])
        is_control = flatten_base.is_control_url(sent_domain)

        if sent_domain:
            domain = sent_domain
        else:
            # Due to a bug the sent field sometimes isn't populated
            # when the measurement failed due to network timeout.
            # Control measurements come at the end, and are not counted as
            # retries, so the position in the list tells the two apart.
            is_control = position > scan['Retries']
            domain = "" if is_control else scan['Keyword']

        row = {
            'domain': domain,
            'category': self.category_matcher.get_category(domain, is_control),
            'ip': scan['Server'],
            'date': date,
            'start_time': started,
            'end_time': trial['EndTime'],
            'anomaly': scan['Blocked'],
            'success': trial['Success'],
            'stateful_block': scan['StatefulBlock'],
            'is_control': is_control,
            'controls_failed': scan['FailSanity'],
            'measurement_id': random_measurement_id,
            'source': flatten_base.source_from_filename(filename),
        }

        # Received data and errors are optional per result; only add the
        # corresponding fields when the result actually carries them.
        if 'Received' in trial:
            row.update(
                flatten_base.parse_received_data(self.blockpage_matcher,
                                                 trial['Received'],
                                                 scan['Blocked']))

        if 'Error' in trial:
            row['error'] = trial['Error']

        yield row
def _process_satellite_blockpages(self, scan: Any,
                                  filepath: str) -> Iterator[Row]:
    """Flatten one line of Satellite blockpage data into rows.

    Args:
      scan: a loaded json object containing the parsed content of the line
      filepath: a filepath string, used to derive each row's 'source'

    Yields:
      Two rows: the fetched http data first ('https': False), then the
      fetched https data ('https': True)
    """
    # Fields common to both the http and the https row.
    shared = {
        'domain': scan['keyword'],
        'ip': scan['ip'],
        'date': scan['start_time'][:10],
        'start_time': format_timestamp(scan['start_time']),
        'end_time': format_timestamp(scan['end_time']),
        'success': scan['fetched'],
        'source': flatten_base.source_from_filename(filepath),
    }

    for is_https, scan_key in ((False, 'http'), (True, 'https')):
        # 'https' goes first so the key order matches: flag, shared fields,
        # then whatever the received-data parser contributes.
        fetched = {'https': is_https, **shared}
        fetched.update(
            flatten_base.parse_received_data(self.blockpage_matcher,
                                             scan.get(scan_key, ''), True))
        yield fetched
def test_parse_received_data_https(self) -> None:
    """Check that an example HTTPS response (with TLS data) is flattened."""
    # yapf: disable
    # Values that appear verbatim in both the input and the expected output.
    body = '<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</HEAD><BODY>\n<H1>Access Denied</H1>\n \nYou don\'t have permission to access "discover.com" on this server.<P>\nReference 18b535dd581604694259a71c660\n</BODY>\n</HTML>\n'
    cert = (
        'MIIG1DCCBbygAwIBAgIQBFzDKr18mq0F13LVYhA6FjANBgkqhkiG9w0BAQsFADB1MQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMTQwMgYDVQQDEytEaWdpQ2VydCBTSEEyIEV4dGVuZGVkIFZhbGlkYXRpb24gU2VydmVyIENBMB4XDTIwMTAwNTAwMDAwMFoXDTIxMTAwNjEyMDAwMFowgdQxHTAbBgNVBA8MFFByaXZhdGUgT3JnYW5pemF0aW9uMRMwEQYLKwYBBAGCNzwCAQMTAkpQMRcwFQYDVQQFEw4wMTAwLTAxLTAwODgyNDELMAkGA1UEBhMCSlAxDjAMBgNVBAgTBVRva3lvMRMwEQYDVQQHEwpDaGl5b2RhLUt1MTkwNwYDVQQKEzBUb2tpbyBNYXJpbmUgYW5kIE5pY2hpZG8gRmlyZSBJbnN1cmFuY2UgQ28uIEx0ZC4xGDAWBgNVBAMTD3d3dy50YWJpa29yZS5qcDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAN+0RFcFIoCHvFTJs/+hexC5SxrKDAytiHNDzXLYaOFuf2LA+7UN33QE8dnINmV0ma7Udd1r8KmXJWJPeTxIJyskad8VNwx0oF00ENS56GYl/y37Y85DE5MQhaQwPEiyQL0TsrL/K2bNYjvEPklBVEOi1vtiOOTZWnUH86MxSe3PwmmXDaFgd3174Z8lEmi20Jl3++Tr/jNeBMw3Sg3KuLW8IUTl6+33mr3Z1u2u6yFN4d7mXlzyo0BxOwlJ1NwJbTzyFnBAfAZ2gJFVFQtuoWdgh9XIquhdFoxCfj/h9zxFK+64xJ+sXGSL5SiEZeBfmvG8SrW4OBSvHzyUSzJKCrsCAwEAAaOCAv4wggL6MB8GA1UdIwQYMBaAFD3TUKXWoK3u80pgCmXTIdT4+NYPMB0GA1UdDgQWBBQKix8NngHND9LiEWxMPAOBE6MwjDAnBgNVHREEIDAeggt0YWJpa29yZS5qcIIPd3d3LnRhYmlrb3JlLmpwMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwdQYDVR0fBG4wbDA0oDKgMIYuaHR0cDovL2NybDMuZGlnaWNlcnQuY29tL3NoYTItZXYtc2VydmVyLWczLmNybDA0oDKgMIYuaHR0cDovL2NybDQuZGlnaWNlcnQuY29tL3NoYTItZXYtc2VydmVyLWczLmNybDBLBgNVHSAERDBCMDcGCWCGSAGG/WwCATAqMCgGCCsGAQUFBwIBFhxodHRwczovL3d3dy5kaWdpY2VydC5jb20vQ1BTMAcGBWeBDAEBMIGIBggrBgEFBQcBAQR8MHowJAYIKwYBBQUHMAGGGGh0dHA6Ly9vY3NwLmRpZ2ljZXJ0LmNvbTBSBggrBgEFBQcwAoZGaHR0cDovL2NhY2VydHMuZGlnaWNlcnQuY29tL0RpZ2lDZXJ0U0hBMkV4dGVuZGVkVmFsaWRhdGlvblNlcnZlckNBLmNydDAJBgNVHRMEAjAAMIIBBAYKKwYBBAHWeQIEAgSB9QSB8gDwAHYA9lyUL9F3MCIUVBgIMJRWjuNNExkzv98MLyALzE7xZOMAAAF093gqNAAABAMARzBFAiEAz0WGut1b8na4VKfulIqCPRbV+lv05YdPNT2xfWreNAYCIDU3JiavbsMjE/r0M9P2c7B07U72W4TK/PdlsKCg5t1PAHYAXNxDkv7mq0VEsV6a1FbmEDf71fpH3KFzlLJe5vbHDsoAAAF093gqgwAABAMARzBFAiApVQum+1q4C4drBI7t6aObwa5jtmWd/BHVTLPgcdhMXgIhAKv+7bC9X9wstKB0OGQbVVX/qsJ5fzf4Y8zNUaklAQiKMA0GCSqGSIb3DQEBCwUAA4IBAQAD02p'
        'ESpGPgJSMTpFVm4VRufgwW95fxA/sch63U94owcOmNtrniSoOr8QwLMAVta6VFU6wddbTBd4vz8zauo4R6uAeFaiUBaFaKb5V2bONGclfjTZ7nsDxsowLracGrRx/rQjjovRo2656g5Iu898WIfADxIvsGc5CICGqLB9GvofVWNNb/DoOXf/vLQJj9m5+ZCi0CrIdh31IB/acHsQ8jWr4VlqPGiz2PIdKjBLuI9ckFbMQ/9DCTWfuJhSfwA3kk2EeUa6WlRrjDhJLasjrEmQiSIf3oywdsPspSYOkT91TFUvzjOmK/yZeApxPJmDvjxpum5GZYnn6QthKxMzL'
    )
    set_cookie = 'bm_sz=6A1BDB4DFCA371F55C598A6D50C7DC3F~YAAQtTXdWKzJ+ZR1AQAA6zY7nwmc3d1xb2D5pqi3WHoMGfNsB8zB22LP5Kz/15sxdI3d3qznv4NzhGdb6CjijzFezAd18NREhybEvZMSZe2JHkjBjli/y1ZRMgC512ln7CCHURjS03UWDIzVrpwPV3Z/h/mq00NF2+LgHsDPelEZoArYVmEwH7OtE4zHAePErKw=; Domain=.discover.com; Path=/; Expires=Sat, 07 Nov 2020 00:24:19 GMT; Max-Age=14400; HttpOnly_abck=7A29878FA7120EC680C6E591A8FF3F5A~-1~YAAQtTXdWK3J+ZR1AQAA6zY7nwR93cThkIxWHn0icKtS6Wgb6NVHSQ80nZ6I2DzczA+1qn/0rXSGZUcFvW/+7tmDF0lHsieeRwnmIydhPELwAsNLjfBMF1lJm9Y7u4ppUpD4WtbRJ1g+Qhd9CLcelH3fQ8AVmJn/jRNN8WrisA8GKuUhpfjv9Gp1aGGqzv12H8u3Ogt/9oOv4Y8nKuS7CWipsFuPftCMeTBVbPn/JsV/NzttmkuFikLj8PwmpNecqlhaH1Ra32XDl/hVsCFWaPm4wdWO3d2WDK8Em0sHzklyTV4iFo6itVlCEHQ=~-1~-1~-1; Domain=.discover.com; Path=/; Expires=Sat, 06 Nov 2021 20:24:19 GMT; Max-Age=31536000; Secure'

    received = {
        'status_line': '403 Forbidden',
        'headers': {
            'Content-Length': ['278'],
            'Content-Type': ['text/html'],
            'Date': ['Fri, 06 Nov 2020 20:24:19 GMT'],
            'Expires': ['Fri, 06 Nov 2020 20:24:19 GMT'],
            'Mime-Version': ['1.0'],
            'Server': ['AkamaiGHost'],
            'Set-Cookie': [set_cookie],
        },
        'body': body,
        'tls': {
            'version': 771,
            'cipher_suite': 49199,
            'cert': cert,
        },
    }

    # Headers are expected back as flat 'Name: value' strings.
    expected = {
        'received_status': '403 Forbidden',
        'received_body': body,
        'received_tls_version': 771,
        'received_tls_cipher_suite': 49199,
        'received_tls_cert': cert,
        'received_headers': [
            'Content-Length: 278',
            'Content-Type: text/html',
            'Date: Fri, 06 Nov 2020 20:24:19 GMT',
            'Expires: Fri, 06 Nov 2020 20:24:19 GMT',
            'Mime-Version: 1.0',
            'Server: AkamaiGHost',
            'Set-Cookie: ' + set_cookie,
        ],
        'blockpage': False,
        'page_signature': 'x_on_this_server',
    }
    # yapf: enable

    parsed = flatten_base.parse_received_data(BlockpageMatcher(), received,
                                              True)
    self.assertDictEqual(parsed, expected)
def _process_hyperquack_v2(self, filename: str, scan: Any,
                           random_measurement_id: str) -> Iterator[Row]:
    """Flatten one HyperQuack V2 scan line into a row per response.

    Handles a line of Echo/Discard/HTTP/S data in the V2 format described at
    https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv2.rst

    Args:
      filename: a filepath string, used to derive each row's 'source'
      scan: a loaded json object containing the parsed content of the line
      random_measurement_id: a hex id identifying this individual measurement,
        copied unchanged into every row produced for the scan

    Yields:
      Rows, one for each entry of the scan's 'response' list
    """
    for attempt in scan.get('response', []):
        date = attempt['start_time'][:10]

        # The test url is looked up unconditionally, so a scan without
        # 'test_url' raises KeyError even when the response is a control.
        test_url = scan['test_url']
        # A response carrying its own 'control_url' is a control measurement.
        is_control = 'control_url' in attempt
        domain: str = attempt['control_url'] if is_control else test_url

        row = {
            'domain': domain,
            'category': self.category_matcher.get_category(domain, is_control),
            'ip': scan['vp'],
            'date': date,
            'start_time': attempt['start_time'],
            'end_time': attempt['end_time'],
            'anomaly': scan['anomaly'],
            'success': attempt['matches_template'],
            'stateful_block': scan['stateful_block'],
            'is_control': is_control,
            'controls_failed': scan.get('controls_failed', None),
            'measurement_id': random_measurement_id,
            'source': flatten_base.source_from_filename(filename),
        }

        # Received data and errors are optional per response; only add the
        # corresponding fields when the response actually carries them.
        if 'response' in attempt:
            row.update(
                flatten_base.parse_received_data(self.blockpage_matcher,
                                                 attempt['response'],
                                                 scan['anomaly']))

        if 'error' in attempt:
            row['error'] = attempt['error']

        yield row
def test_parse_received_data_no_header_field(self) -> None:
    """Test parsing received HTTP/S data that has no 'headers' field."""
    status_line = '403 Forbidden'
    body = '<test-body>'

    received = {
        'status_line': status_line,
        'body': body,
        # No 'headers' field
    }

    # The missing headers are expected to come back as an empty list.
    expected = {
        'received_status': status_line,
        'received_body': body,
        'received_headers': [],
        'blockpage': None,
        'page_signature': None,
    }

    parsed = flatten_base.parse_received_data(BlockpageMatcher(), received,
                                              True)
    self.assertDictEqual(parsed, expected)
def test_parse_received_data_http_status_line_false_positive(self) -> None:
    """Test HTTP data whose status line hits a false positive signature."""
    status_line = '521 Origin Down'
    body = '<html><head></title></head><body>test/body></html>'

    received = {
        'status_line': status_line,
        'headers': {},
        'body': body,
    }

    # The signature is reported, but the page is not flagged as a blockpage.
    expected = {
        'received_status': status_line,
        'received_body': body,
        'received_headers': [],
        'blockpage': False,
        'page_signature': 'x_521',
    }

    parsed = flatten_base.parse_received_data(BlockpageMatcher(), received,
                                              True)
    self.assertDictEqual(parsed, expected)
def test_parse_received_data_http(self) -> None:
    """Test that sample HTTP data is flattened and flagged as a blockpage."""
    # yapf: disable
    status_line = '403 Forbidden'
    body = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1256"><title>MNN3-1(1)</title></head><body><iframe src="http://10.10.34.35:80" style="width: 100%; height: 100%" scrolling="no" marginwidth="0" marginheight="0" frameborder="0" vspace="0" hspace="0"></iframe></body></html>\r\n\r\n'
    # yapf: enable

    received = {
        'status_line': status_line,
        'headers': {},
        'body': body,
    }

    expected = {
        'received_status': status_line,
        'received_body': body,
        'received_headers': [],
        'blockpage': True,
        'page_signature': 'b_nat_ir_national_1',
    }

    parsed = flatten_base.parse_received_data(BlockpageMatcher(), received,
                                              True)
    self.assertDictEqual(parsed, expected)
def test_parse_received_data_http_header_false_positive(self) -> None:
    """Test sample HTTP data where the blockpage match is in a header."""
    status_line = '403 Forbidden'
    body = '<html><head></title></head><body>test/body></html>'

    received = {
        'status_line': status_line,
        'headers': {
            'Server': ['Barracuda/NGFirewall'],
        },
        'body': body,
    }

    # The header is expected back as a flat 'Name: value' string, and the
    # page is expected to be flagged as a blockpage.
    expected = {
        'received_status': status_line,
        'received_body': body,
        'received_headers': ['Server: Barracuda/NGFirewall'],
        'blockpage': True,
        'page_signature': 'a_prod_barracuda_2',
    }

    parsed = flatten_base.parse_received_data(BlockpageMatcher(), received,
                                              True)
    self.assertDictEqual(parsed, expected)