def test_source_from_filename(self) -> None:
  """Test getting the data source name from the filename."""
  self.assertEqual(
      flatten_base.source_from_filename(
          'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'
      ), 'CP_Quack-echo-2020-08-23-06-01-02')
  self.assertEqual(
      flatten_base.source_from_filename(
          'gs://firehook-scans/http/CP_Quack-http-2020-09-13-01-02-07/results.json'
      ), 'CP_Quack-http-2020-09-13-01-02-07')
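
# A minimal sketch of the behavior the test above pins down (a hypothetical
# reimplementation, not the actual flatten_base.source_from_filename): the
# source name is the directory component that contains the results file.
import pathlib


def _source_from_filename_sketch(filepath: str) -> str:
  """Return the name of the scan directory containing the results file."""
  return pathlib.PurePosixPath(filepath).parent.name


assert _source_from_filename_sketch(
    'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json'
) == 'CP_Quack-echo-2020-08-23-06-01-02'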
def _process_hyperquack_v1(self, filename: str, scan: Any,
                           random_measurement_id: str) -> Iterator[Row]:
  """Process a line of Echo/Discard/HTTP/S data in HyperQuack V1 format.

  https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv1.rst

  Args:
    filename: a filepath string
    scan: a loaded json object containing the parsed content of the line
    random_measurement_id: a hex id identifying this individual measurement

  Yields:
    Rows
  """
  for index, result in enumerate(scan.get('Results', [])):
    date = result['StartTime'][:10]

    sent_domain = _extract_domain_from_sent_field(result['Sent'])
    is_control = flatten_base.is_control_url(sent_domain)
    # Due to a bug the sent field sometimes isn't populated
    # when the measurement failed due to network timeout.
    if not sent_domain:
      # Control measurements come at the end, and are not counted as retries.
      is_control = index > scan['Retries']
      if is_control:
        domain = ""
      else:
        domain = scan['Keyword']
    else:
      domain = sent_domain

    row = {
        'domain': domain,
        'category': self.category_matcher.get_category(domain, is_control),
        'ip': scan['Server'],
        'date': date,
        'start_time': result['StartTime'],
        'end_time': result['EndTime'],
        'anomaly': scan['Blocked'],
        'success': result['Success'],
        'stateful_block': scan['StatefulBlock'],
        'is_control': is_control,
        'controls_failed': scan['FailSanity'],
        'measurement_id': random_measurement_id,
        'source': flatten_base.source_from_filename(filename),
    }

    if 'Received' in result:
      received = result.get('Received', '')
      received_fields = flatten_base.parse_received_data(
          self.blockpage_matcher, received, scan['Blocked'])
      row.update(received_fields)

    if 'Error' in result:
      row['error'] = result['Error']

    yield row
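
# A hedged sketch (illustrative values, not real scan data) of the shape of
# HyperQuack V1 line that _process_hyperquack_v1 consumes. The field names
# are exactly the ones the method reads above; everything else is made up:
_EXAMPLE_HYPERQUACK_V1_LINE = {
    'Server': '1.2.3.4',
    'Keyword': 'example.com',
    'Retries': 1,
    'Blocked': False,
    'FailSanity': False,
    'StatefulBlock': False,
    'Results': [{
        'Sent': 'example.com',
        'Received': 'HTTP/1.1 200 OK ...',
        'Success': False,
        'Error': 'Incorrect echo response',
        'StartTime': '2020-08-23T06:01:02.000000000-04:00',
        'EndTime': '2020-08-23T06:01:03.000000000-04:00',
    }],
}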
def _process_satellite_v2(self, scan: Any, filepath: str,
                          random_measurement_id: str) -> Iterator[Row]:
  """Process a line of Satellite v2 data.

  Args:
    scan: a loaded json object containing the parsed content of the line
    filepath: path like "<path>/<filename>.json.gz"
    random_measurement_id: a hex id identifying this individual measurement

  Yields:
    Rows
  """
  is_control_domain = flatten_base.is_control_url(scan['test_url'])

  row = {
      'domain': scan['test_url'],
      'is_control': is_control_domain,
      'category': self.category_matcher.get_category(scan['test_url'],
                                                     is_control_domain),
      'ip': scan['vp'],
      'is_control_ip': scan['vp'] in CONTROL_IPS,
      'country': scan.get('location', {}).get('country_code'),
      'date': scan['start_time'][:10],
      'start_time': format_timestamp(scan['start_time']),
      'end_time': format_timestamp(scan['end_time']),
      'error': scan.get('error', None),
      'anomaly': scan['anomaly'],
      'success': not scan['connect_error'],
      'received': None,
      'measurement_id': random_measurement_id,
      'source': flatten_base.source_from_filename(filepath),
  }

  if datetime.date.fromisoformat(row['date']) < SATELLITE_V2_2_START_DATE:
    yield from _process_satellite_v2p1(row, scan)
  else:
    yield from _process_satellite_v2p2(row, scan)
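
# A hedged sketch (illustrative values only) of a Satellite v2 line as read
# by _process_satellite_v2; only fields the method accesses are shown:
_EXAMPLE_SATELLITE_V2_LINE = {
    'test_url': 'example.com',
    'vp': '8.8.8.8',
    'location': {'country_code': 'US'},
    'start_time': '2021-04-18 14:49:01.62448452 -0400 EDT',
    'end_time': '2021-04-18 14:49:03.624563629 -0400 EDT',
    'error': None,
    'anomaly': False,
    'connect_error': False,
}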
def _data_to_load(self,
                  gcs: GCSFileSystem,
                  scan_type: str,
                  incremental_load: bool,
                  table_name: str,
                  start_date: Optional[datetime.date] = None,
                  end_date: Optional[datetime.date] = None) -> List[str]:
  """Select the right files to read.

  Args:
    gcs: GCSFileSystem object
    scan_type: one of 'echo', 'discard', 'http', 'https', 'satellite'
    incremental_load: boolean. If true, only read the latest new data
    table_name: dataset.table name like 'base.scan_echo'
    start_date: date object, only files after or at this date will be read
    end_date: date object, only files at or before this date will be read

  Returns:
    A List of filename strings. ex
      ['gs://firehook-scans/echo/CP_Quack-echo-2020-08-22-06-08-03/results.json',
       'gs://firehook-scans/echo/CP_Quack-echo-2020-08-23-06-01-02/results.json']
  """
  if incremental_load:
    full_table_name = self._get_full_table_name(table_name)
    existing_sources = _get_existing_datasources(full_table_name)
  else:
    existing_sources = []

  if scan_type == satellite.SCAN_TYPE_SATELLITE:
    files_to_load = flatten_satellite.SATELLITE_FILES
  else:
    files_to_load = SCAN_FILES

  # Filepath like `gs://firehook-scans/echo/**/*`
  files_regex = f'{self.bucket}{scan_type}/**/*'
  file_metadata = [m.metadata_list for m in gcs.match([files_regex])][0]

  filepaths = [metadata.path for metadata in file_metadata]
  file_sizes = [metadata.size_in_bytes for metadata in file_metadata]

  filtered_filenames = [
      filepath for (filepath, file_size) in zip(filepaths, file_sizes)
      if (_between_dates(filepath, start_date, end_date) and
          _filename_matches(filepath, files_to_load) and
          flatten_base.source_from_filename(filepath) not in existing_sources
          and file_size > EMPTY_GZIPPED_FILE_SIZE)
  ]
  return filtered_filenames
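
# A hedged sketch of the date filter assumed above (hypothetical; the real
# _between_dates helper may differ). Scan directories embed a date, e.g.
# 'CP_Quack-echo-2020-08-22-06-08-03', so the date can be parsed out of the
# filepath and compared against the optional window bounds:
import datetime
import re
from typing import Optional


def _between_dates_sketch(filepath: str,
                          start_date: Optional[datetime.date] = None,
                          end_date: Optional[datetime.date] = None) -> bool:
  """Return True if the date embedded in filepath is inside the window."""
  match = re.search(r'(\d{4}-\d{2}-\d{2})', filepath)
  if not match:
    return False
  date = datetime.date.fromisoformat(match.group(1))
  return ((start_date is None or start_date <= date) and
          (end_date is None or date <= end_date))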
def _process_hyperquack_v2(self, filename: str, scan: Any,
                           random_measurement_id: str) -> Iterator[Row]:
  """Process a line of Echo/Discard/HTTP/S data in HyperQuack V2 format.

  https://github.com/censoredplanet/censoredplanet/blob/master/docs/hyperquackv2.rst

  Args:
    filename: a filepath string
    scan: a loaded json object containing the parsed content of the line
    random_measurement_id: a hex id identifying this individual measurement

  Yields:
    Rows
  """
  for response in scan.get('response', []):
    date = response['start_time'][:10]

    domain: str = response.get('control_url', scan['test_url'])
    is_control = 'control_url' in response

    row = {
        'domain': domain,
        'category': self.category_matcher.get_category(domain, is_control),
        'ip': scan['vp'],
        'date': date,
        'start_time': response['start_time'],
        'end_time': response['end_time'],
        'anomaly': scan['anomaly'],
        'success': response['matches_template'],
        'stateful_block': scan['stateful_block'],
        'is_control': is_control,
        'controls_failed': scan.get('controls_failed', None),
        'measurement_id': random_measurement_id,
        'source': flatten_base.source_from_filename(filename),
    }

    if 'response' in response:
      received = response.get('response', '')
      received_fields = flatten_base.parse_received_data(
          self.blockpage_matcher, received, scan['anomaly'])
      row.update(received_fields)

    if 'error' in response:
      row['error'] = response['error']

    yield row
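
# A hedged sketch (illustrative values only) of a HyperQuack V2 line as read
# by _process_hyperquack_v2; only fields the method accesses are shown. The
# control URL value is a made-up placeholder:
_EXAMPLE_HYPERQUACK_V2_LINE = {
    'vp': '1.2.3.4',
    'test_url': 'example.com',
    'anomaly': False,
    'stateful_block': False,
    'controls_failed': False,
    'response': [{
        'start_time': '2021-04-26T04:21:46.9939575-04:00',
        'end_time': '2021-04-26T04:21:47.211024-04:00',
        'matches_template': True,
        'control_url': 'control-example.com',
        'response': '',
    }],
}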
def _process_satellite_v1(self, date: str, scan: Any, filepath: str,
                          random_measurement_id: str) -> Iterator[Row]:
  """Process a line of Satellite data.

  Args:
    date: a date string YYYY-mm-DD
    scan: a loaded json object containing the parsed content of the line
    filepath: one of
      <path>/answers_control.json
      <path>/interference.json
      also potentially .gz files
    random_measurement_id: a hex id identifying this individual measurement

  Yields:
    Rows
  """
  filename = pathlib.PurePosixPath(filepath).name
  if '.gz' in pathlib.PurePosixPath(filename).suffixes:
    filename = pathlib.PurePosixPath(filename).stem

  row = {
      'domain': scan['query'],
      'is_control': False,  # v1 doesn't have domain controls
      'category': self.category_matcher.get_category(scan['query'], False),
      'ip': scan.get('resolver', scan.get('ip')),
      'is_control_ip': filename == SATELLITE_ANSWERS_CONTROL_FILE,
      'date': date,
      'error': scan.get('error', None),
      'anomaly': not scan['passed'] if 'passed' in scan else None,
      'success': 'error' not in scan,
      'received': None,
      'rcode': ['0'] if 'error' not in scan else ['-1'],
      'measurement_id': random_measurement_id,
      'source': flatten_base.source_from_filename(filepath),
  }

  if isinstance(row['error'], dict):
    row['error'] = json.dumps(row['error'])

  received_ips = scan.get('answers')
  yield from _process_received_ips(row, received_ips)
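
# A hedged sketch (illustrative values only) of a Satellite v1 interference
# line as read by _process_satellite_v1; only accessed fields are shown, and
# the 'answers' value is an assumed placeholder for whatever per-IP answer
# data _process_received_ips consumes:
_EXAMPLE_SATELLITE_V1_LINE = {
    'query': 'example.com',
    'resolver': '8.8.8.8',
    'passed': True,
    'answers': {'93.184.216.34': ['ip', 'http', 'cert']},
}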
def _process_satellite_blockpages(self, scan: Any,
                                  filepath: str) -> Iterator[Row]:
  """Process a line of Satellite blockpage data.

  Args:
    scan: a loaded json object containing the parsed content of the line
    filepath: a filepath string

  Yields:
    Rows, usually 2 corresponding to the fetched http and https data
    respectively
  """
  row = {
      'domain': scan['keyword'],
      'ip': scan['ip'],
      'date': scan['start_time'][:10],
      'start_time': format_timestamp(scan['start_time']),
      'end_time': format_timestamp(scan['end_time']),
      'success': scan['fetched'],
      'source': flatten_base.source_from_filename(filepath),
  }

  http = {
      'https': False,
  }
  http.update(row)
  received_fields = flatten_base.parse_received_data(
      self.blockpage_matcher, scan.get('http', ''), True)
  http.update(received_fields)
  yield http

  https = {
      'https': True,
  }
  https.update(row)
  received_fields = flatten_base.parse_received_data(
      self.blockpage_matcher, scan.get('https', ''), True)
  https.update(received_fields)
  yield https
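
# A hedged sketch (illustrative values only) of a Satellite blockpage line
# as read by _process_satellite_blockpages; only accessed fields are shown,
# and the 'http'/'https' payloads are made-up fetch results:
_EXAMPLE_BLOCKPAGE_LINE = {
    'keyword': 'example.com',
    'ip': '93.184.216.34',
    'fetched': True,
    'start_time': '2021-04-18 14:49:01.62448452 -0400 EDT',
    'end_time': '2021-04-18 14:49:03.624563629 -0400 EDT',
    'http': 'HTTP/1.1 403 Forbidden ...',
    'https': 'Get "https://93.184.216.34/": tls error',
}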
def _process_satellite_v2_control(
    self, scan: Any, filepath: str,
    random_measurement_id: str) -> Iterator[Row]:
  """Process a line of Satellite ip control data.

  Args:
    scan: a loaded json object containing the parsed content of the line
    filepath: path like "<path>/<filename>.json.gz"
    random_measurement_id: a hex id identifying this individual measurement

  Yields:
    Rows
  """
  responses = scan.get('response', [])
  if responses:
    # An overall satellite v2 measurement
    # always contains some non-control trial domains
    is_control_domain = False

    row = {
        'domain': scan['test_url'],
        'is_control': is_control_domain,
        'category': self.category_matcher.get_category(scan['test_url'],
                                                       is_control_domain),
        'ip': scan['vp'],
        'is_control_ip': True,
        'date': responses[0]['start_time'][:10],
        'start_time': format_timestamp(responses[0]['start_time']),
        'end_time': format_timestamp(responses[-1]['end_time']),
        'anomaly': None,
        'success': not scan['connect_error'],
        'controls_failed': not scan['passed_control'],
        'rcode': [str(response['rcode']) for response in responses],
        'measurement_id': random_measurement_id,
        'source': flatten_base.source_from_filename(filepath),
    }
    errors = [
        response['error']
        for response in responses
        if response['error'] and response['error'] != 'null'
    ]
    row['error'] = ' | '.join(errors) if errors else None

    for response in responses:
      if response['url'] == row['domain']:
        # Check response for test domain
        if response['rcode'] == 0 and response['has_type_a']:
          # Valid answers
          row['has_type_a'] = True
          # Separate into one answer IP per row for tagging
          for ip in response['response']:
            row['received'] = {'ip': ip}
            yield row.copy()
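
# A hedged sketch (illustrative values only) of a Satellite v2 ip control
# line as read by _process_satellite_v2_control; only accessed fields are
# shown. Note the method reads response['error'] == 'null' as "no error":
_EXAMPLE_SATELLITE_V2_CONTROL_LINE = {
    'test_url': 'example.com',
    'vp': '8.8.8.8',
    'connect_error': False,
    'passed_control': True,
    'response': [{
        'url': 'example.com',
        'rcode': 0,
        'has_type_a': True,
        'error': 'null',
        'response': ['93.184.216.34'],
        'start_time': '2021-04-18 14:49:01.62448452 -0400 EDT',
        'end_time': '2021-04-18 14:49:03.624563629 -0400 EDT',
    }],
}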