def _process_hyperquack_v1(self, filename: str, scan: Any,
                               random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Echo/Discard/HTTP/S data in HyperQuack V1 format.

    https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv1.rst

    Args:
      filename: a filepath string
      scan: a loaded json object containing the parsed content of the line
      random_measurement_id: a hex id identifying this individual measurement

    Yields:
      Rows
    """
        for index, result in enumerate(scan.get('Results', [])):
            date = result['StartTime'][:10]

            sent_domain = _extract_domain_from_sent_field(result['Sent'])
            is_control = flatten_base.is_control_url(sent_domain)
            # Due to a bug the sent field sometimes isn't populated
            # when the measurement failed due to network timeout.
            if not sent_domain:
                # Control measurements come at the end, and are not counted as retries.
                is_control = index > scan['Retries']
                if is_control:
                    domain = ""
                else:
                    domain = scan['Keyword']
            else:
                domain = sent_domain

            row = {
                'domain': domain,
                'category':
                self.category_matcher.get_category(domain, is_control),
                'ip': scan['Server'],
                'date': date,
                'start_time': result['StartTime'],
                'end_time': result['EndTime'],
                'anomaly': scan['Blocked'],
                'success': result['Success'],
                'stateful_block': scan['StatefulBlock'],
                'is_control': is_control,
                'controls_failed': scan['FailSanity'],
                'measurement_id': random_measurement_id,
                'source': flatten_base.source_from_filename(filename),
            }

            if 'Received' in result:
                received = result.get('Received', '')
                received_fields = flatten_base.parse_received_data(
                    self.blockpage_matcher, received, scan['Blocked'])
                row.update(received_fields)

            if 'Error' in result:
                row['error'] = result['Error']

            yield row
    def _process_satellite_blockpages(self, scan: Any,
                                      filepath: str) -> Iterator[Row]:
        """Flatten one line of Satellite blockpage data into rows.

        Args:
          scan: a loaded json object containing the parsed content of the line
          filepath: a filepath string

        Yields:
          Rows, one for the fetched http data followed by one for the https data
        """
        shared = {
            'domain': scan['keyword'],
            'ip': scan['ip'],
            'date': scan['start_time'][:10],
            'start_time': format_timestamp(scan['start_time']),
            'end_time': format_timestamp(scan['end_time']),
            'success': scan['fetched'],
            'source': flatten_base.source_from_filename(filepath),
        }

        # The two rows differ only in the 'https' flag and in which fetched
        # page gets parsed; the http row is emitted first.
        for scheme, is_https in (('http', False), ('https', True)):
            row = {'https': is_https}
            row.update(shared)
            row.update(
                flatten_base.parse_received_data(self.blockpage_matcher,
                                                 scan.get(scheme, ''), True))
            yield row
    def test_parse_received_data_https(self) -> None:
        """Test parsing example HTTPS data.

        The response body, the Set-Cookie value and the TLS certificate are
        each defined once and shared between the input and the expected output,
        because the test expects them to come back unchanged. Each of them is a
        single string literal on one physical line; a plain quoted literal must
        not be wrapped across lines.
        """
        # yapf: disable
        body = '<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</HEAD><BODY>\n<H1>Access Denied</H1>\n \nYou don\'t have permission to access "discover.com" on this server.<P>\nReference 18b535dd581604694259a71c660\n</BODY>\n</HTML>\n'
        set_cookie = 'bm_sz=6A1BDB4DFCA371F55C598A6D50C7DC3F~YAAQtTXdWKzJ+ZR1AQAA6zY7nwmc3d1xb2D5pqi3WHoMGfNsB8zB22LP5Kz/15sxdI3d3qznv4NzhGdb6CjijzFezAd18NREhybEvZMSZe2JHkjBjli/y1ZRMgC512ln7CCHURjS03UWDIzVrpwPV3Z/h/mq00NF2+LgHsDPelEZoArYVmEwH7OtE4zHAePErKw=; Domain=.discover.com; Path=/; Expires=Sat, 07 Nov 2020 00:24:19 GMT; Max-Age=14400; HttpOnly_abck=7A29878FA7120EC680C6E591A8FF3F5A~-1~YAAQtTXdWK3J+ZR1AQAA6zY7nwR93cThkIxWHn0icKtS6Wgb6NVHSQ80nZ6I2DzczA+1qn/0rXSGZUcFvW/+7tmDF0lHsieeRwnmIydhPELwAsNLjfBMF1lJm9Y7u4ppUpD4WtbRJ1g+Qhd9CLcelH3fQ8AVmJn/jRNN8WrisA8GKuUhpfjv9Gp1aGGqzv12H8u3Ogt/9oOv4Y8nKuS7CWipsFuPftCMeTBVbPn/JsV/NzttmkuFikLj8PwmpNecqlhaH1Ra32XDl/hVsCFWaPm4wdWO3d2WDK8Em0sHzklyTV4iFo6itVlCEHQ=~-1~-1~-1; Domain=.discover.com; Path=/; Expires=Sat, 06 Nov 2021 20:24:19 GMT; Max-Age=31536000; Secure'
        cert = 'MIIG1DCCBbygAwIBAgIQBFzDKr18mq0F13LVYhA6FjANBgkqhkiG9w0BAQsFADB1MQswCQYDVQQGEwJVUzEVMBMGA1UEChMMRGlnaUNlcnQgSW5jMRkwFwYDVQQLExB3d3cuZGlnaWNlcnQuY29tMTQwMgYDVQQDEytEaWdpQ2VydCBTSEEyIEV4dGVuZGVkIFZhbGlkYXRpb24gU2VydmVyIENBMB4XDTIwMTAwNTAwMDAwMFoXDTIxMTAwNjEyMDAwMFowgdQxHTAbBgNVBA8MFFByaXZhdGUgT3JnYW5pemF0aW9uMRMwEQYLKwYBBAGCNzwCAQMTAkpQMRcwFQYDVQQFEw4wMTAwLTAxLTAwODgyNDELMAkGA1UEBhMCSlAxDjAMBgNVBAgTBVRva3lvMRMwEQYDVQQHEwpDaGl5b2RhLUt1MTkwNwYDVQQKEzBUb2tpbyBNYXJpbmUgYW5kIE5pY2hpZG8gRmlyZSBJbnN1cmFuY2UgQ28uIEx0ZC4xGDAWBgNVBAMTD3d3dy50YWJpa29yZS5qcDCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAN+0RFcFIoCHvFTJs/+hexC5SxrKDAytiHNDzXLYaOFuf2LA+7UN33QE8dnINmV0ma7Udd1r8KmXJWJPeTxIJyskad8VNwx0oF00ENS56GYl/y37Y85DE5MQhaQwPEiyQL0TsrL/K2bNYjvEPklBVEOi1vtiOOTZWnUH86MxSe3PwmmXDaFgd3174Z8lEmi20Jl3++Tr/jNeBMw3Sg3KuLW8IUTl6+33mr3Z1u2u6yFN4d7mXlzyo0BxOwlJ1NwJbTzyFnBAfAZ2gJFVFQtuoWdgh9XIquhdFoxCfj/h9zxFK+64xJ+sXGSL5SiEZeBfmvG8SrW4OBSvHzyUSzJKCrsCAwEAAaOCAv4wggL6MB8GA1UdIwQYMBaAFD3TUKXWoK3u80pgCmXTIdT4+NYPMB0GA1UdDgQWBBQKix8NngHND9LiEWxMPAOBE6MwjDAnBgNVHREEIDAeggt0YWJpa29yZS5qcIIPd3d3LnRhYmlrb3JlLmpwMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUHAwIwdQYDVR0fBG4wbDA0oDKgMIYuaHR0cDovL2NybDMuZGlnaWNlcnQuY29tL3NoYTItZXYtc2VydmVyLWczLmNybDA0oDKgMIYuaHR0cDovL2NybDQuZGlnaWNlcnQuY29tL3NoYTItZXYtc2VydmVyLWczLmNybDBLBgNVHSAERDBCMDcGCWCGSAGG/WwCATAqMCgGCCsGAQUFBwIBFhxodHRwczovL3d3dy5kaWdpY2VydC5jb20vQ1BTMAcGBWeBDAEBMIGIBggrBgEFBQcBAQR8MHowJAYIKwYBBQUHMAGGGGh0dHA6Ly9vY3NwLmRpZ2ljZXJ0LmNvbTBSBggrBgEFBQcwAoZGaHR0cDovL2NhY2VydHMuZGlnaWNlcnQuY29tL0RpZ2lDZXJ0U0hBMkV4dGVuZGVkVmFsaWRhdGlvblNlcnZlckNBLmNydDAJBgNVHRMEAjAAMIIBBAYKKwYBBAHWeQIEAgSB9QSB8gDwAHYA9lyUL9F3MCIUVBgIMJRWjuNNExkzv98MLyALzE7xZOMAAAF093gqNAAABAMARzBFAiEAz0WGut1b8na4VKfulIqCPRbV+lv05YdPNT2xfWreNAYCIDU3JiavbsMjE/r0M9P2c7B07U72W4TK/PdlsKCg5t1PAHYAXNxDkv7mq0VEsV6a1FbmEDf71fpH3KFzlLJe5vbHDsoAAAF093gqgwAABAMARzBFAiApVQum+1q4C4drBI7t6aObwa5jtmWd/BHVTLPgcdhMXgIhAKv+7bC9X9wstKB0OGQbVVX/qsJ5fzf4Y8zNUaklAQiKMA0GCSqGSIb3DQEBCwUAA4IBAQAD02pESpGPgJSMTpFVm4VRufgwW95fxA/sch63U94owcOmNtrniSoOr8QwLMAVta6VFU6wddbTBd4vz8zauo4R6uAeFaiUBaFaKb5V2bONGclfjTZ7nsDxsowLracGrRx/rQjjovRo2656g5Iu898WIfADxIvsGc5CICGqLB9GvofVWNNb/DoOXf/vLQJj9m5+ZCi0CrIdh31IB/acHsQ8jWr4VlqPGiz2PIdKjBLuI9ckFbMQ/9DCTWfuJhSfwA3kk2EeUa6WlRrjDhJLasjrEmQiSIf3oywdsPspSYOkT91TFUvzjOmK/yZeApxPJmDvjxpum5GZYnn6QthKxMzL'
        # yapf: enable

        received = {
            'status_line': '403 Forbidden',
            'headers': {
                'Content-Length': ['278'],
                'Content-Type': ['text/html'],
                'Date': ['Fri, 06 Nov 2020 20:24:19 GMT'],
                'Expires': ['Fri, 06 Nov 2020 20:24:19 GMT'],
                'Mime-Version': ['1.0'],
                'Server': ['AkamaiGHost'],
                'Set-Cookie': [set_cookie]
            },
            'body': body,
            'tls': {
                'version': 771,
                'cipher_suite': 49199,
                'cert': cert
            }
        }

        expected = {
            'received_status': '403 Forbidden',
            'received_body': body,
            'received_tls_version': 771,
            'received_tls_cipher_suite': 49199,
            'received_tls_cert': cert,
            'received_headers': [
                'Content-Length: 278',
                'Content-Type: text/html',
                'Date: Fri, 06 Nov 2020 20:24:19 GMT',
                'Expires: Fri, 06 Nov 2020 20:24:19 GMT',
                'Mime-Version: 1.0',
                'Server: AkamaiGHost',
                'Set-Cookie: ' + set_cookie
            ],
            'blockpage': False,
            'page_signature': 'x_on_this_server',
        }
        blockpage_matcher = BlockpageMatcher()
        parsed = flatten_base.parse_received_data(blockpage_matcher, received,
                                                  True)
        self.assertDictEqual(parsed, expected)
    def _process_hyperquack_v2(self, filename: str, scan: Any,
                               random_measurement_id: str) -> Iterator[Row]:
        """Process a line of Echo/Discard/HTTP/S data in HyperQuack V2 format.

    https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv2.rst

    Args:
      filename: a filepath string
      scan: a loaded json object containing the parsed content of the line
      random_measurement_id: a hex id identifying this individual measurement

    Yields:
      Rows
    """
        for response in scan.get('response', []):
            date = response['start_time'][:10]
            domain: str = response.get('control_url', scan['test_url'])
            is_control = 'control_url' in response

            row = {
                'domain': domain,
                'category':
                self.category_matcher.get_category(domain, is_control),
                'ip': scan['vp'],
                'date': date,
                'start_time': response['start_time'],
                'end_time': response['end_time'],
                'anomaly': scan['anomaly'],
                'success': response['matches_template'],
                'stateful_block': scan['stateful_block'],
                'is_control': is_control,
                'controls_failed': scan.get('controls_failed', None),
                'measurement_id': random_measurement_id,
                'source': flatten_base.source_from_filename(filename),
            }

            if 'response' in response:
                received = response.get('response', '')
                received_fields = flatten_base.parse_received_data(
                    self.blockpage_matcher, received, scan['anomaly'])
                row.update(received_fields)

            if 'error' in response:
                row['error'] = response['error']

            yield row
    def test_parse_received_data_no_header_field(self) -> None:
        """Test parsing received HTTP/S data that lacks a 'headers' field."""
        # The 'headers' key is deliberately left out of the response.
        response = dict(status_line='403 Forbidden', body='<test-body>')

        want = dict(
            received_status='403 Forbidden',
            received_body='<test-body>',
            received_headers=[],
            blockpage=None,
            page_signature=None,
        )

        matcher = BlockpageMatcher()
        got = flatten_base.parse_received_data(matcher, response, True)
        self.assertDictEqual(got, want)
    def test_parse_received_data_http_status_line_false_positive(self) -> None:
        """Test parsing HTTP data whose status line matches a false positive."""
        page = '<html><head></title></head><body>test/body></html>'
        response = dict(status_line='521 Origin Down', headers={}, body=page)

        # The 521 status line is expected to be tagged with a signature but
        # not to be reported as a blockpage.
        want = dict(
            received_status='521 Origin Down',
            received_body=page,
            received_headers=[],
            blockpage=False,
            page_signature='x_521',
        )

        matcher = BlockpageMatcher()
        got = flatten_base.parse_received_data(matcher, response, True)
        self.assertDictEqual(got, want)
    def test_parse_received_data_http(self) -> None:
        """Test parsing sample HTTP data that is expected to match a blockpage."""
        # yapf: disable
        page = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1256"><title>MNN3-1(1)</title></head><body><iframe src="http://10.10.34.35:80" style="width: 100%; height: 100%" scrolling="no" marginwidth="0" marginheight="0" frameborder="0" vspace="0" hspace="0"></iframe></body></html>\r\n\r\n'
        # yapf: enable
        response = dict(status_line='403 Forbidden', headers={}, body=page)

        want = dict(
            received_status='403 Forbidden',
            received_body=page,
            received_headers=[],
            blockpage=True,
            page_signature='b_nat_ir_national_1',
        )

        matcher = BlockpageMatcher()
        got = flatten_base.parse_received_data(matcher, response, True)
        self.assertDictEqual(got, want)
    def test_parse_received_data_http_header_false_positive(self) -> None:
        """Test parsing HTTP data whose Server header matches a blockpage."""
        page = '<html><head></title></head><body>test/body></html>'
        response = dict(
            status_line='403 Forbidden',
            headers={'Server': ['Barracuda/NGFirewall']},
            body=page,
        )

        # The body itself is innocuous; the expected match comes from the
        # Server header, which is flattened to a 'Name: value' string.
        want = dict(
            received_status='403 Forbidden',
            received_body=page,
            received_headers=['Server: Barracuda/NGFirewall'],
            blockpage=True,
            page_signature='a_prod_barracuda_2',
        )

        matcher = BlockpageMatcher()
        got = flatten_base.parse_received_data(matcher, response, True)
        self.assertDictEqual(got, want)