def test_flatten_measurement_invalid_json(self) -> None:
  """Check that an unparseable line logs a warning and produces no rows."""
  bad_line = 'invalid json'
  expected_warning = ('WARNING:root:JSONDecodeError: '
                      'Expecting value: line 1 column 1 (char 0)\n'
                      'Filename: test_filename.json\ninvalid json\n')

  with self.assertLogs(level='WARNING') as captured:
    # Materialise the rows inside the context so the warning is captured.
    flattened = beam_tables._flatten_measurement('test_filename.json',
                                                 bad_line)
    rows = list(flattened)

  self.assertEqual(captured.output[0], expected_warning)
  self.assertEqual(len(rows), 0)
def test_flatten_measurement_http_success(self) -> None:
  """Check the first row flattened from a successful HTTP measurement.

  Some measurements receive neither data nor errors. In that case the
  received_ and error fields should be absent from the row, and will end up
  Null in bigquery.
  """
  source_file = 'gs://firehook-scans/http/CP_Quack-http-2020-11-09-01-02-08/results.json'

  measurement_json = """{
    "Server":"170.248.33.11",
    "Keyword":"scribd.com",
    "Retries":0,
    "Results":[
      {
        "Sent":"scribd.com",
        "Success":true,
        "StartTime":"2020-11-09T01:10:47.826486107-05:00",
        "EndTime":"2020-11-09T01:10:47.84869292-05:00"
      }
    ],
    "Blocked":false,
    "FailSanity":false,
    "StatefulBlock":false
  }"""

  want = {
      'domain': 'scribd.com',
      'ip': '170.248.33.11',
      'date': '2020-11-09',
      'start_time': '2020-11-09T01:10:47.826486107-05:00',
      'end_time': '2020-11-09T01:10:47.84869292-05:00',
      'retries': 0,
      'sent': 'scribd.com',
      'blocked': False,
      'success': True,
      'fail_sanity': False,
      'stateful_block': False,
      'measurement_id': '',
      'source': 'CP_Quack-http-2020-11-09-01-02-08',
  }

  flattened = list(
      beam_tables._flatten_measurement(source_file, measurement_json))
  got = flattened[0]
  # The measurement id is random, so blank it out before comparing.
  got['measurement_id'] = ''
  self.assertEqual(got, want)
def test_flatten_measurement_https(self) -> None:
  """Check the first row flattened from an unsuccessful HTTPS measurement."""
  source_file = 'gs://firehook-scans/http/CP_Quack-https-2020-11-06-15-15-31/results.json'

  # yapf: disable
  # The certificate is passed through unchanged, so it is written once here
  # and shared between the JSON input and the expected row.
  tls_cert = 'MIIHLzCCBhegAwIBAgIQDCECYKFMPekAAAAAVNFY9jANBgkqhkiG9w0BAQsFADCBujELMAkGA1UEBhMCVVMxFjAUBgNVBAoTDUVudHJ1c3QsIEluYy4xKDAmBgNVBAsTH1NlZSB3d3cuZW50cnVzdC5uZXQvbGVnYWwtdGVybXMxOTA3BgNVBAsTMChjKSAyMDE0IEVudHJ1c3QsIEluYy4gLSBmb3IgYXV0aG9yaXplZCB1c2Ugb25seTEuMCwGA1UEAxMlRW50cnVzdCBDZXJ0aWZpY2F0aW9uIEF1dGhvcml0eSAtIEwxTTAeFw0yMDA1MTIxMTIzMDNaFw0yMTA1MTIxMTUzMDJaMIHCMQswCQYDVQQGEwJMQjEPMA0GA1UEBxMGQmVpcnV0MRMwEQYLKwYBBAGCNzwCAQMTAkxCMRcwFQYLKwYBBAGCNzwCAQETBkJlaXJ1dDEWMBQGA1UEChMNQmFuayBBdWRpIFNBTDEdMBsGA1UEDxMUUHJpdmF0ZSBPcmdhbml6YXRpb24xDjAMBgNVBAsTBUJBU0FMMQ4wDAYDVQQFEwUxMTM0NzEdMBsGA1UEAxMUam9icy5iYW5rYXVkaS5jb20ubGIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC37LFk2A2Q8xxahyjhOkul8O9Nv5FFp0NkL4qIy2fTUbsz1uWOqQKo0jDS6Inwtb+i84//znY7ed7Uu5LfbPk0Biefkl4ke0d9LZ3fu7y0iQWWUqKGn4YAPDGv3R0y/47XlhHhDaR+D0z7SbmYHx2NQI7fj6iEfEB90PvPhrdDEKHypNoXa5PwOuGSoU0l+yGmuvF5N7/hr82y987pLRjMdJaszs5EM//C+eiyL9mTA8gvOOf3ZHYQ4ITsJpA9I2Q0E6fDQhGS8SDW2ktdZ7z2TIOQsyMuXJKbBeXCgKyjnaX5UWDis8Hpj43CI8Kge32qsqaTKbjf3Mb66nqHrwSdAgMBAAGjggMlMIIDITA5BgNVHREEMjAwghRqb2JzLmJhbmthdWRpLmNvbS5sYoIYd3d3LmpvYnMuYmFua2F1ZGkuY29tLmxiMIIBfQYKKwYBBAHWeQIEAgSCAW0EggFpAWcAdQBVgdTCFpA2AUrqC5tXPFPwwOQ4eHAlCBcvo6odBxPTDAAAAXIIuy40AAAEAwBGMEQCIEByP85HYDmBb/4WK0B6s5L66Owim+Hzf3jiPYvzhw5eAiBsT1ZEn5PuJfBZ9a9Y/TzJ8K9Qx+3+pyJATsPglI4z3AB2AJQgvB6O1Y1siHMfgosiLA3R2k1ebE+UPWHbTi9YTaLCAAABcgi7LlQAAAQDAEcwRQIgOgyG1ORFwA+sDB3cD4fCu25ahSyMi/4d+xvrP+STJxgCIQDXm1WBzc+gQlU/PhpVti+e4j+2MouWIBBvjw3k0/HTtgB2APZclC/RdzAiFFQYCDCUVo7jTRMZM7/fDC8gC8xO8WTjAAABcgi7LqAAAAQDAEcwRQIgaiMkFpZwGZ5Iac/cfTL8v6TbPHUIeSVjTnB1Z2m9gsoCIQCJr+wqJ0UF+FYhxq9ChDfn1Ukg3uVQePrv4WoWNYjOZzAOBgNVHQ8BAf8EBAMCBaAwHQYDVR0lBBYwFAYIKwYBBQUHAwEGCCsGAQUFBwMCMGgGCCsGAQUFBwEBBFwwWjAjBggrBgEFBQcwAYYXaHR0cDovL29jc3AuZW50cnVzdC5uZXQwMwYIKwYBBQUHMAKGJ2h0dHA6Ly9haWEuZW50cnVzdC5uZXQvbDFtLWNoYWluMjU2LmNlcjAzBgNVHR8ELDAqMCigJqAkhiJodHRwOi8vY3JsLmVudHJ1c3QubmV0L2xldmVsMW0uY3JsMEoGA1UdIARDMEEwNgYKYIZIAYb6bAoBAjAoMCYGCCsGAQUFBwIBFhpodHRwOi8vd3d3LmVudHJ1c3QubmV0L3JwYTAHBgVngQwBATAfBgNVHSMEGDAWgBTD99C1KjCtrw2RIXA5VN28iXDHOjAdBgNVHQ4EFgQUt5uewiz6lN1FGnoOCX/soGsCwoIwCQYDVR0TBAIwADANBgkqhkiG9w0BAQsFAAOCAQEArlnXiyOefAVaQd0jfxtGwzAed4c8EidlBaoebJACR4zlAIFG0r0pXbdHkLZnCkMCL7XvoV+Y27c1I/Tfcket6qr4gDuKKnbUZIdgg8LGU2OklVEfLv1LJi3+tRuGGCfKpzHWoL1FW+3T6YEETGeb1qZrGBE7Its/4WfVAwaBHynSrvdjeQTYuYP8XsvehhfI5PNQbfV3KIH+sOF7sg80C2sIEyxwD+VEfRGeV6nEhJGJdlibAWfNOwQAyRQcGoiVIdLoa9um9UAUugjktJJ/Dk74YyxIf3aX1yjqTANVIuBgSotC8FvUNTmAALL7Ug8fqvJ9sPQhxIataKh/JdrDCQ=='

  # __TLS_CERT__ is swapped for the certificate below to keep the JSON legible.
  json_template = """{
    "Server":"213.175.166.157",
    "Keyword":"www.arabhra.org",
    "Retries":2,
    "Results":[
      {
        "Sent":"www.arabhra.org",
        "Received":{
          "status_line":"302 Found",
          "headers":{
            "Content-Language":["en"],
            "Content-Type":["text/html; charset=iso-8859-1"],
            "Date":["Fri, 06 Nov 2020 20:24:21 GMT"],
            "Location":[
              "https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp"
            ],
            "Set-Cookie":[
              "BIGipServer~IFRMS-WEB~IFRMS-OHS-HTTPS=rd7o00000000000000000000ffffc0a8fde4o4443; expires=Fri, 06-Nov-2020 21:24:21 GMT; path=/; Httponly; Secure",
              "TS016c74f4=01671efb9a1a400535e215d6f76498a5887425fed793ca942baa75f16076e60e1350988222922fa06fc16f53ef016d9ecd38535fcabf14861525811a7c3459e91086df326f; Path=/"
            ],
            "X-Frame-Options":["SAMEORIGIN"]
          },
          "body":"\\u003c!DOCTYPE HTML PUBLIC \\\"-//IETF//DTD HTML 2.0//EN\\\"\\u003e\\n\\u003cHTML\\u003e\\u003cHEAD\\u003e\\n\\u003cTITLE\\u003e302 Found\\u003c/TITLE\\u003e\\n\\u003c/HEAD\\u003e\\u003cBODY\\u003e\\n\\u003cH1\\u003eFound\\u003c/H1\\u003e\\nThe document has moved \\u003cA HREF=\\\"https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp\\\"\\u003ehere\\u003c/A\\u003e.\\u003cP\\u003e\\n\\u003c/BODY\\u003e\\u003c/HTML\\u003e\\n",
          "tls":{
            "version":771,
            "cipher_suite":49199,
            "cert":"__TLS_CERT__"
          }
        },
        "Success":false,
        "Error":"Incorrect web response: status lines don't match",
        "StartTime":"2020-11-06T15:24:21.124508839-05:00",
        "EndTime":"2020-11-06T15:24:21.812075476-05:00"
      }
    ],
    "Blocked":false,
    "FailSanity":false,
    "StatefulBlock":false
  }
  """
  measurement_json = json_template.replace('__TLS_CERT__', tls_cert)

  want: beam_tables.Row = {
      'domain': 'www.arabhra.org',
      'ip': '213.175.166.157',
      'date': '2020-11-06',
      'start_time': '2020-11-06T15:24:21.124508839-05:00',
      'end_time': '2020-11-06T15:24:21.812075476-05:00',
      'retries': 2,
      'sent': 'www.arabhra.org',
      'received_status': '302 Found',
      # The JSON body above is full of unicode escapes; the flattened value
      # is expected to hold the decoded characters instead.
      'received_body': '<!DOCTYPE HTML PUBLIC \"-//IETF//DTD HTML 2.0//EN\">\n<HTML><HEAD>\n<TITLE>302 Found</TITLE>\n</HEAD><BODY>\n<H1>Found</H1>\nThe document has moved <A HREF=\"https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp\">here</A>.<P>\n</BODY></HTML>\n',
      'received_tls_version': 771,
      'received_tls_cipher_suite': 49199,
      'received_tls_cert': tls_cert,
      'received_headers': [
          'Content-Language: en',
          'Content-Type: text/html; charset=iso-8859-1',
          'Date: Fri, 06 Nov 2020 20:24:21 GMT',
          'Location: https://jobs.bankaudi.com.lb/OA_HTML/IrcVisitor.jsp',
          'Set-Cookie: BIGipServer~IFRMS-WEB~IFRMS-OHS-HTTPS=rd7o00000000000000000000ffffc0a8fde4o4443; expires=Fri, 06-Nov-2020 21:24:21 GMT; path=/; Httponly; Secure',
          'Set-Cookie: TS016c74f4=01671efb9a1a400535e215d6f76498a5887425fed793ca942baa75f16076e60e1350988222922fa06fc16f53ef016d9ecd38535fcabf14861525811a7c3459e91086df326f; Path=/',
          'X-Frame-Options: SAMEORIGIN',
      ],
      'error': 'Incorrect web response: status lines don\'t match',
      'blocked': False,
      'success': False,
      'fail_sanity': False,
      'stateful_block': False,
      'measurement_id': '',
      'source': 'CP_Quack-https-2020-11-06-15-15-31',
  }
  # yapf: enable

  flattened = list(
      beam_tables._flatten_measurement(source_file, measurement_json))
  got = flattened[0]
  # The measurement id is random, so blank it out before comparing.
  got['measurement_id'] = ''
  self.assertEqual(got, want)
def test_flatten_measurement_http(self) -> None:
  """Check the first row flattened from an unsuccessful HTTP measurement."""
  source_file = 'gs://firehook-scans/http/CP_Quack-http-2020-09-13-01-02-07/results.json'

  measurement_json = """{
    "Server":"184.50.171.225",
    "Keyword":"www.csmonitor.com",
    "Retries":0,
    "Results":[
      {
        "Sent":"www.csmonitor.com",
        "Received":{
          "status_line":"301 Moved Permanently",
          "headers":{
            "Content-Length":["0"],
            "Date":["Sun, 13 Sep 2020 05:10:58 GMT"],
            "Location":["https://www.csmonitor.com/"],
            "Server":["HTTP Proxy/1.0"]
          },
          "body":"test body"
        },
        "Success":false,
        "Error":"Incorrect web response: status lines don't match",
        "StartTime":"2020-09-13T01:10:57.499263112-04:00",
        "EndTime":"2020-09-13T01:10:58.077524926-04:00"
      }
    ],
    "Blocked":true,
    "FailSanity":false,
    "StatefulBlock":false
  }"""

  want = {
      'domain': 'www.csmonitor.com',
      'ip': '184.50.171.225',
      'date': '2020-09-13',
      'start_time': '2020-09-13T01:10:57.499263112-04:00',
      'end_time': '2020-09-13T01:10:58.077524926-04:00',
      'retries': 0,
      'sent': 'www.csmonitor.com',
      'received_status': '301 Moved Permanently',
      'received_body': 'test body',
      'received_headers': [
          'Content-Length: 0',
          'Date: Sun, 13 Sep 2020 05:10:58 GMT',
          'Location: https://www.csmonitor.com/',
          'Server: HTTP Proxy/1.0',
      ],
      'error': 'Incorrect web response: status lines don\'t match',
      'blocked': True,
      'success': False,
      'fail_sanity': False,
      'stateful_block': False,
      'measurement_id': '',
      'source': 'CP_Quack-http-2020-09-13-01-02-07',
  }

  flattened = list(
      beam_tables._flatten_measurement(source_file, measurement_json))
  got = flattened[0]
  # The measurement id is random, so blank it out before comparing.
  got['measurement_id'] = ''
  self.assertEqual(got, want)
def test_flatten_measurement_echo(self) -> None:
  """Check that an Echo measurement with two results flattens to two rows."""
  source_file = 'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'

  measurement_json = """{
    "Server":"1.2.3.4",
    "Keyword":"www.example.com",
    "Retries":1,
    "Results":[
      {
        "Sent":"GET / HTTP/1.1 Host: www.example.com",
        "Received":"HTTP/1.1 403 Forbidden",
        "Success":false,
        "Error":"Incorrect echo response",
        "StartTime":"2020-09-20T07:45:09.643770291-04:00",
        "EndTime":"2020-09-20T07:45:10.088851843-04:00"
      },
      {
        "Sent":"GET / HTTP/1.1 Host: www.example.com",
        "Received": "HTTP/1.1 503 Service Unavailable",
        "Success":false,
        "Error":"Incorrect echo response",
        "StartTime":"2020-09-20T07:45:16.170427683-04:00",
        "EndTime":"2020-09-20T07:45:16.662093893-04:00"
      }
    ],
    "Blocked":true,
    "FailSanity":false,
    "StatefulBlock":false
  }"""

  forbidden_row: beam_tables.Row = {
      'domain': 'www.example.com',
      'ip': '1.2.3.4',
      'date': '2020-09-20',
      'start_time': '2020-09-20T07:45:09.643770291-04:00',
      'end_time': '2020-09-20T07:45:10.088851843-04:00',
      'retries': 1,
      'sent': 'GET / HTTP/1.1 Host: www.example.com',
      'received_status': 'HTTP/1.1 403 Forbidden',
      'error': 'Incorrect echo response',
      'blocked': True,
      'success': False,
      'fail_sanity': False,
      'stateful_block': False,
      'measurement_id': '',
      'source': 'CP_Quack-echo-2020-08-23-06-01-02',
  }
  unavailable_row: beam_tables.Row = {
      'domain': 'www.example.com',
      'ip': '1.2.3.4',
      'date': '2020-09-20',
      'start_time': '2020-09-20T07:45:16.170427683-04:00',
      'end_time': '2020-09-20T07:45:16.662093893-04:00',
      'retries': 1,
      'sent': 'GET / HTTP/1.1 Host: www.example.com',
      'received_status': 'HTTP/1.1 503 Service Unavailable',
      'error': 'Incorrect echo response',
      'blocked': True,
      'success': False,
      'fail_sanity': False,
      'stateful_block': False,
      'measurement_id': '',
      'source': 'CP_Quack-echo-2020-08-23-06-01-02',
  }
  expected_rows: List[beam_tables.Row] = [forbidden_row, unavailable_row]

  rows = list(beam_tables._flatten_measurement(source_file, measurement_json))
  self.assertEqual(len(rows), 2)

  # Both rows come from one measurement, so they must share its id.
  first, second = rows
  self.assertEqual(first['measurement_id'], second['measurement_id'])

  # The id itself is random, so blank it out before the full comparison.
  for row in rows:
    row['measurement_id'] = ''
  self.assertListEqual(rows, expected_rows)