def test_skip_already_processed_or_discovered_files(
    self,
    mock_fs_factory: Mock,
) -> None:
    mock_fs = FakeGCSFileSystem()
    mock_fs.test_add_path(
        path=GcsfsFilePath.from_bucket_and_blob_name(
            "recidiviz-456-direct-ingest-state-us-xx", "raw_data/test_file.txt"
        ),
        local_path=None,
    )
    mock_fs.test_add_path(
        path=GcsfsFilePath.from_bucket_and_blob_name(
            "recidiviz-456-direct-ingest-state-us-xx", "raw_data/test_file.csv"
        ),
        local_path=None,
    )
    mock_fs.test_add_path(
        path=GcsfsFilePath.from_bucket_and_blob_name(
            "recidiviz-456-direct-ingest-state-us-xx", "raw_data/skipped.csv"
        ),
        local_path=None,
    )
    mock_fs.test_add_path(
        path=GcsfsFilePath.from_bucket_and_blob_name(
            "recidiviz-456-direct-ingest-state-us-xx", "raw_data/discovered.csv"
        ),
        local_path=None,
    )
    mock_fs_factory.return_value = mock_fs

    controller = UploadStateFilesToIngestBucketController(
        paths_with_timestamps=[
            (
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.txt",
                TODAY,
            ),
            (
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.csv",
                TODAY,
            ),
            (
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/skipped.csv",
                TODAY,
            ),
            (
                "recidiviz-456-direct-ingest-state-us-xx/raw_data/discovered.csv",
                TODAY,
            ),
        ],
        project_id="recidiviz-456",
        region="us_xx",
    )
    result: MultiRequestResultWithSkipped[str, str, str] = controller.do_upload()
    self.assertListEqual(
        result.successes,
        [
            "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.txt",
            "recidiviz-456-direct-ingest-state-us-xx/raw_data/test_file.csv",
        ],
    )
    self.assertListEqual(
        result.skipped,
        [
            "recidiviz-456-direct-ingest-state-us-xx/raw_data/skipped.csv",
            "recidiviz-456-direct-ingest-state-us-xx/raw_data/discovered.csv",
        ],
    )
    self.assertFalse(self.us_xx_manager.is_instance_paused())
def is_locked(self, name: str) -> bool:
    """Checks whether @param name is locked by checking whether its lock file exists.

    Returns True if locked, False if unlocked.
    """
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    return self.fs.exists(path)
def set_config_yaml(self, contents: str) -> None:
    path = GcsfsFilePath.from_absolute_path(
        f"gs://{self.mock_project_id}-configs/cloud_sql_to_bq_config.yaml"
    )
    self.fake_gcs.upload_from_string(
        path=path, contents=contents, content_type="text/yaml"
    )
def gcs_path(filepath: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_absolute_path(
        os.path.join("gs://justice_counts", filepath)
    )
def on_file_added(self, path: GcsfsFilePath) -> None:
    if path.abs_path().startswith(self.controller.ingest_bucket_path.abs_path()):
        self.controller.handle_file(path, start_ingest=self.can_start_ingest)
def _test_get_local_file(file_path: GcsfsFilePath) -> str:
    local_path = os.path.join(
        os.path.realpath(os.path.dirname(os.path.realpath(__file__))), "auth_fixtures"
    )
    return Path(os.path.join(local_path, file_path.abs_path())).read_text()
def test_export_final_existence_validation_failed(self) -> None:
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path('gs://bucket1/US_XX'),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path('gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )

    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)
    # This should cause export_and_validate to raise a ViewExportValidationError
    mock_fs.exists.return_value = False

    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths

    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path('txt'),
        export_config_two_staging.output_path('txt')
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths

    # Make the actual call
    exporter = CompositeBigQueryViewExporter(
        mock_bq_client, mock_fs, [delegate_one, delegate_two])
    with pytest.raises(ViewExportValidationError) as e:
        exporter.export_and_validate([export_config_one, export_config_two])

    # We get an error at the very end of the export chain because, even though the
    # delegate validations passed, the final validation failed.
    self.assertIn(
        'Validation on path bucket1/US_XX/view1.json failed the metric file export. '
        'Stopping execution here.',
        str(e.value))

    # The delegate exporters' validations all passed, so we still copy from staging to final.
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    mock_fs.copy.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.json'),
            GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.json'),
            GcsfsFilePath(bucket_name='bucket2', blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.txt'),
            GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.txt'),
            GcsfsFilePath(bucket_name='bucket2', blob_name='US_XX/view2.txt'))
    ])
    mock_fs.delete.assert_has_calls([
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.json')),
        call(GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.json')),
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.txt')),
        call(GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.txt'))
    ])
    # Only one call to the exists validation is made because the first one failed.
    mock_fs.exists.assert_has_calls([
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.json')),
    ])
def output_path(self, extension: str) -> GcsfsFilePath:
    file_name = f'{self.view.view_id}.{extension}'
    return GcsfsFilePath.from_directory_and_file_name(self.output_directory, file_name)
def default_config_path() -> GcsfsFilePath:
    return GcsfsFilePath.from_absolute_path(
        f"gs://{metadata.project_id()}-configs/cloud_sql_to_bq_config.yaml"
    )
def test_post_process_downloads(self) -> None:
    result = self.delegate.post_process_downloads(
        GcsfsFilePath.from_absolute_path("test_bucket/test.txt"),
        FakeGCSFileSystem(),
    )
    self.assertEqual(result, "test_bucket/test.txt")
def revert_staging_path_to_original(staging_path: GcsfsFilePath) -> GcsfsFilePath:
    staging_prefix = 'staging/'
    blob_name = staging_path.blob_name
    # str.lstrip() strips a set of characters, not a prefix, so remove the prefix explicitly.
    non_staging_relative_path = (
        blob_name[len(staging_prefix):]
        if blob_name.startswith(staging_prefix)
        else blob_name
    )
    return GcsfsFilePath.from_absolute_path(
        f'{staging_path.bucket_name}/{non_staging_relative_path}')
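# A standalone illustration (not from the source) of why the prefix is removed
# explicitly above: str.lstrip() treats its argument as a character set, not a
# prefix, so it can also eat characters that follow "staging/".
def _strip_prefix_example(value: str, prefix: str) -> str:
    return value[len(prefix):] if value.startswith(prefix) else value


assert "staging/tags/file.json".lstrip("staging/") == "file.json"  # "tags/" eaten too
assert _strip_prefix_example("staging/tags/file.json", "staging/") == "tags/file.json"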
def process_job() -> Tuple[str, HTTPStatus]:
    """Processes a single direct ingest file, specified in the provided ingest
    arguments.
    """
    logging.info("Received request to process direct ingest job: [%s]", request.values)
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path", request.values, preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(
            gcsfs_path.bucket_path
        ).value,
    ):
        json_data = request.get_data(as_text=True)
        ingest_args = _parse_cloud_task_args(json_data)

        if not ingest_args:
            raise DirectIngestError(
                msg="process_job was called with no GcsfsIngestArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        if not isinstance(ingest_args, GcsfsIngestArgs):
            raise DirectIngestError(
                msg=f"process_job was called with incorrect args type [{type(ingest_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        if gcsfs_path != ingest_args.file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {ingest_args.file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags({TagKey.INGEST_TASK_TAG: ingest_args.task_id_tag()}):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=ingest_args.file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            try:
                controller.run_ingest_job_and_kick_scheduler_on_completion(ingest_args)
            except GCSPseudoLockAlreadyExists as e:
                logging.warning(str(e))
                return str(e), HTTPStatus.CONFLICT

    return "", HTTPStatus.OK
def raw_data_import() -> Tuple[str, HTTPStatus]:
    """Imports a single raw direct ingest CSV file from a location in GCS File System
    to its corresponding raw data table in BQ.
    """
    logging.info(
        "Received request to do direct ingest raw data import: [%s]", request.values
    )
    region_code = get_str_param_value("region", request.values)
    file_path = get_str_param_value("file_path", request.values, preserve_case=True)

    if not region_code or not file_path:
        response = f"Bad parameters [{request.values}]"
        logging.error(response)
        return response, HTTPStatus.BAD_REQUEST

    gcsfs_path = GcsfsFilePath.from_absolute_path(file_path)

    with monitoring.push_region_tag(
        region_code,
        ingest_instance=DirectIngestInstance.for_ingest_bucket(
            gcsfs_path.bucket_path
        ).value,
    ):
        json_data = request.get_data(as_text=True)
        data_import_args = _parse_cloud_task_args(json_data)

        if not data_import_args:
            raise DirectIngestError(
                msg="raw_data_import was called with no GcsfsRawDataBQImportArgs.",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        if not isinstance(data_import_args, GcsfsRawDataBQImportArgs):
            raise DirectIngestError(
                msg=f"raw_data_import was called with incorrect args type [{type(data_import_args)}].",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )
        if gcsfs_path != data_import_args.raw_data_file_path:
            raise DirectIngestError(
                msg=f"Different paths were passed in the url and request body\n"
                f"url: {gcsfs_path.uri()}\n"
                f"body: {data_import_args.raw_data_file_path.uri()}",
                error_type=DirectIngestErrorType.INPUT_ERROR,
            )

        with monitoring.push_tags(
            {TagKey.RAW_DATA_IMPORT_TAG: data_import_args.task_id_tag()}
        ):
            try:
                controller = DirectIngestControllerFactory.build(
                    ingest_bucket_path=data_import_args.raw_data_file_path.bucket_path,
                    allow_unlaunched=False,
                )
            except DirectIngestError as e:
                if e.is_bad_request():
                    logging.error(str(e))
                    return str(e), HTTPStatus.BAD_REQUEST
                raise e

            controller.do_raw_data_import(data_import_args)

    return "", HTTPStatus.OK
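# A minimal sketch (hypothetical helper, not part of the source) of the path
# consistency check that both handlers above perform: the file path passed in the
# URL must match the path embedded in the request body's task args. It assumes the
# same DirectIngestError / DirectIngestErrorType types the handlers import.
def _check_url_and_body_paths_match(
    url_path: GcsfsFilePath, body_path: GcsfsFilePath
) -> None:
    if url_path != body_path:
        raise DirectIngestError(
            msg=(
                "Different paths were passed in the url and request body\n"
                f"url: {url_path.uri()}\n"
                f"body: {body_path.uri()}"
            ),
            error_type=DirectIngestErrorType.INPUT_ERROR,
        )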
def is_file(self, path: str) -> bool:
    try:
        file = GcsfsFilePath.from_absolute_path(path)
        return self.exists(file)
    except ValueError:
        return False
def test_direct_ingest_file_moves_with_file_types(self) -> None:
    self.fully_process_file(
        datetime.datetime.now(),
        GcsfsFilePath(bucket_name="my_bucket", blob_name="test_file.csv"),
    )
def test_export_happy_path(self) -> None:
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view1",
        view_query_template="select * from table",
        dimensions=("a", "b", "c"),
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path("gs://bucket1/US_XX"),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/staging/US_XX"
        ),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view2",
        view_query_template="select * from view2",
        dimensions=("d", "e", "f"),
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path("gs://bucket2/US_XX"),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/staging/US_XX"
        ),
    )

    mock_fs = create_autospec(GCSFileSystem)
    mock_fs.exists.return_value = True

    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path("json"),
        export_config_two_staging.output_path("json"),
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths

    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path("txt"),
        export_config_two_staging.output_path("txt"),
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths

    # Make the actual call
    export_views_with_exporters(
        mock_fs,
        [export_config_one, export_config_two],
        {
            ExportOutputFormatType.JSON: delegate_one,
            ExportOutputFormatType.METRIC: delegate_two,
        },
    )

    # Assert all mocks called as expected
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    mock_fs.copy.assert_has_calls(
        [
            call(
                GcsfsFilePath(bucket_name="bucket1", blob_name="staging/US_XX/view1.json"),
                GcsfsFilePath(bucket_name="bucket1", blob_name="US_XX/view1.json"),
            ),
            call(
                GcsfsFilePath(bucket_name="bucket2", blob_name="staging/US_XX/view2.json"),
                GcsfsFilePath(bucket_name="bucket2", blob_name="US_XX/view2.json"),
            ),
            call(
                GcsfsFilePath(bucket_name="bucket1", blob_name="staging/US_XX/view1.txt"),
                GcsfsFilePath(bucket_name="bucket1", blob_name="US_XX/view1.txt"),
            ),
            call(
                GcsfsFilePath(bucket_name="bucket2", blob_name="staging/US_XX/view2.txt"),
                GcsfsFilePath(bucket_name="bucket2", blob_name="US_XX/view2.txt"),
            ),
        ],
        any_order=True,
    )
    mock_fs.delete.assert_has_calls(
        [
            call(GcsfsFilePath(bucket_name="bucket1", blob_name="staging/US_XX/view1.json")),
            call(GcsfsFilePath(bucket_name="bucket2", blob_name="staging/US_XX/view2.json")),
            call(GcsfsFilePath(bucket_name="bucket1", blob_name="staging/US_XX/view1.txt")),
            call(GcsfsFilePath(bucket_name="bucket2", blob_name="staging/US_XX/view2.txt")),
        ],
        any_order=True,
    )
def fully_process_file(self, dt: datetime.datetime, path: GcsfsFilePath) -> None:
    """Mimics all the file system calls for a single file in the direct ingest
    system, from getting added to the ingest bucket, turning into a processed file,
    then getting moved to storage."""
    fixture_util.add_direct_ingest_path(
        self.fs.gcs_file_system,
        path,
        region_code=TEST_STATE_REGION.region_code,
        has_fixture=False,
    )
    start_num_total_files = len(self.fs.gcs_file_system.all_paths)
    # pylint: disable=protected-access
    start_ingest_paths = self.fs._ls_with_file_prefix(self.INGEST_DIR_PATH, "", None)
    start_storage_paths = self.fs._ls_with_file_prefix(self.STORAGE_DIR_PATH, "", None)
    start_raw_storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH,
        "",
        file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
    )
    start_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH,
        "",
        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
    )

    # File is renamed to normalized path
    self.fs.mv_path_to_normalized_path(path, GcsfsDirectIngestFileType.RAW_DATA, dt)

    raw_unprocessed = self.fs.get_unprocessed_file_paths(
        self.INGEST_DIR_PATH,
        file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
    )
    self.assertEqual(len(raw_unprocessed), 1)
    self.assertTrue(self.fs.is_seen_unprocessed_file(raw_unprocessed[0]))

    # ... raw file is imported to BQ
    processed_path = self.fs.mv_path_to_processed_path(raw_unprocessed[0])

    processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH, None)
    self.assertEqual(len(processed), 1)

    self.fs.copy(
        processed_path,
        GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                processed_path.abs_path(),
                file_type_override=GcsfsDirectIngestFileType.INGEST_VIEW,
            )
        ),
    )
    self.fs.mv_path_to_storage(processed_path, self.STORAGE_DIR_PATH)

    ingest_unprocessed = self.fs.get_unprocessed_file_paths(
        self.INGEST_DIR_PATH, file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW
    )
    self.assertEqual(len(ingest_unprocessed), 1)
    self.assertTrue(self.fs.is_seen_unprocessed_file(ingest_unprocessed[0]))

    # ... file is ingested

    # File is moved to processed path
    self.fs.mv_path_to_processed_path(ingest_unprocessed[0])
    processed = self.fs.get_processed_file_paths(self.INGEST_DIR_PATH, None)
    self.assertEqual(len(processed), 1)
    self.assertTrue(self.fs.is_processed_file(processed[0]))

    unprocessed = self.fs.get_unprocessed_file_paths(self.INGEST_DIR_PATH, None)
    self.assertEqual(len(unprocessed), 0)

    # File is moved to storage
    self.fs.mv_processed_paths_before_date_to_storage(
        self.INGEST_DIR_PATH,
        self.STORAGE_DIR_PATH,
        date_str_bound=dt.date().isoformat(),
        include_bound=True,
        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
    )

    end_ingest_paths = self.fs._ls_with_file_prefix(
        self.INGEST_DIR_PATH, "", file_type_filter=None
    )
    end_storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH, "", file_type_filter=None
    )
    end_raw_storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH,
        "",
        file_type_filter=GcsfsDirectIngestFileType.RAW_DATA,
    )
    end_ingest_view_storage_paths = self.fs._ls_with_file_prefix(
        self.STORAGE_DIR_PATH,
        "",
        file_type_filter=GcsfsDirectIngestFileType.INGEST_VIEW,
    )

    # Each file gets re-exported as an ingest view
    splitting_factor = 2
    expected_final_total_files = start_num_total_files + splitting_factor - 1
    self.assertEqual(len(self.fs.gcs_file_system.all_paths), expected_final_total_files)
    self.assertEqual(len(end_ingest_paths), len(start_ingest_paths) - 1)
    self.assertEqual(
        len(end_storage_paths), len(start_storage_paths) + 1 * splitting_factor
    )
    self.assertEqual(
        len(end_raw_storage_paths) + len(end_ingest_view_storage_paths),
        len(end_storage_paths),
    )
    self.assertEqual(len(end_raw_storage_paths), len(start_raw_storage_paths) + 1)
    self.assertEqual(
        len(end_ingest_view_storage_paths),
        len(start_ingest_view_storage_paths) + 1,
    )

    for sp in end_storage_paths:
        parts = filename_parts_from_path(sp)
        if sp.abs_path() not in {p.abs_path() for p in start_storage_paths}:
            self.assertTrue(sp.abs_path().startswith(self.STORAGE_DIR_PATH.abs_path()))
            dir_path, storage_file_name = os.path.split(sp.abs_path())
            self.assertTrue(parts.file_type.value in dir_path)
            name, _ = path.file_name.split(".")
            self.assertTrue(name in storage_file_name)
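# A standalone sketch (illustrative only, not the source implementation) of the
# normalized file naming convention exercised above and in the filename_parts tests
# below: "<processed_state>_<utc timestamp>_<file type prefix>_<file tag>.<ext>".
def _example_normalized_name(file_tag: str, timestamp_str: str) -> str:
    return f"unprocessed_{timestamp_str}_raw_{file_tag}.csv"


# Mirrors the fixture names used in the filename_parts tests.
assert (
    _example_normalized_name("elite_offenders", "2019-08-07T22:09:18:770655")
    == "unprocessed_2019-08-07T22:09:18:770655_raw_elite_offenders.csv"
)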
def retrieve_data(state_code: str, report_type: str, batch_id: str) -> List[Recipient]:
    """Retrieves the data for email generation of the given report type for the given state.

    Gets the data from Cloud Storage and returns it as a list of dictionaries. Saves the
    data file into an archive bucket on completion, so that we have the ability to
    troubleshoot or re-generate a previous batch of emails later on.

    Args:
        state_code: State identifier used to retrieve appropriate data
        report_type: The type of report, used to determine the data file name
        batch_id: The identifier for this batch

    Returns:
        A list of recipient data dictionaries

    Raises:
        Non-recoverable errors that should stop execution. Attempts to catch and handle
        errors that are recoverable. Provides logging for debug purposes whenever possible.
    """
    data_bucket = utils.get_data_storage_bucket_name()
    data_filename = ""
    gcs_file_system = GcsfsFactory.build()
    try:
        data_filename = utils.get_data_filename(state_code, report_type)
        path = GcsfsFilePath.from_absolute_path(f"gs://{data_bucket}/{data_filename}")
        file_contents = gcs_file_system.download_as_string(path)
    except BaseException:
        logging.info("Unable to load data file %s/%s", data_bucket, data_filename)
        raise

    archive_bucket = utils.get_data_archive_bucket_name()
    archive_filename = ""
    try:
        archive_filename = utils.get_data_archive_filename(batch_id)
        archive_path = GcsfsFilePath.from_absolute_path(
            f"gs://{archive_bucket}/{archive_filename}"
        )
        gcs_file_system.upload_from_string(
            path=archive_path, contents=file_contents, content_type="text/json"
        )
    except Exception:
        logging.error(
            "Unable to archive the data file to %s/%s", archive_bucket, archive_filename
        )
        raise

    json_list = file_contents.splitlines()

    recipient_data: List[dict] = []
    for json_str in json_list:
        try:
            item = json.loads(json_str)
        except Exception as err:
            logging.error(
                "Unable to parse JSON found in the file %s. Offending json string is: '%s'. <%s> %s",
                data_filename,
                json_str,
                type(err).__name__,
                err,
            )
        else:
            recipient_data.append(item)

    logging.info(
        "Retrieved %s recipients from data file %s", len(recipient_data), data_filename
    )
    return [
        Recipient.from_report_json(
            {
                **recipient,
                utils.KEY_BATCH_ID: batch_id,
            }
        )
        for recipient in recipient_data
    ]
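# A minimal, standalone sketch (assumed sample data, not a fixture from the source)
# of the JSON-lines parsing that retrieve_data performs on the downloaded contents:
# each line is parsed independently, so one malformed row is logged and skipped
# rather than dropping the whole batch.
import json

_example_contents = (
    '{"email_address": "a@example.com"}\n'
    'not json\n'
    '{"email_address": "b@example.com"}'
)
_parsed = []
for _line in _example_contents.splitlines():
    try:
        _parsed.append(json.loads(_line))
    except json.JSONDecodeError:
        pass  # retrieve_data logs the offending line and continues
assert len(_parsed) == 2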
def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
    name, _extension = os.path.splitext(self.path.file_name)
    return GcsfsFilePath.from_directory_and_file_name(
        self.temp_output_directory_path, f'temp_{name}_{chunk_num}.csv')
def test_filename_parts_from_path_unspecified_file_type(self):
    with self.assertRaises(DirectIngestError):
        filename_parts_from_path(
            GcsfsFilePath.from_absolute_path('bucket/us_ca_sf/elite_offenders.csv'))

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_'
            'elite_offenders.csv'))
    self.assertEqual(parts.processed_state, 'unprocessed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'elite_offenders')
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-08-07T22:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-08-07')
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
            'elite_offenders.csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'elite_offenders')
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
            'elite_offenders_1split.csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'elite_offenders')
    self.assertEqual(parts.filename_suffix, '1split')
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    # Needs the actual file_split suffix to be a file split
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
            'elite_offenders_002_file_split.csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'elite_offenders')
    self.assertEqual(parts.filename_suffix, '002_file_split')
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
            'elite_offenders_002_file_split_size300.csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'elite_offenders')
    self.assertEqual(parts.filename_suffix, '002_file_split_size300')
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
            'BrazosCounty_2019_09_25.csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'BrazosCounty')
    self.assertEqual(parts.filename_suffix, '2019_09_25')
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-nd/processed_2019-09-07T00:09:18:770655_'
            'BrazosCounty_2019_09_25_002_file_split_size300.csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'BrazosCounty')
    self.assertEqual(parts.filename_suffix, '2019_09_25_002_file_split_size300')
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
            'tak001_offender_identification.csv'))
    self.assertEqual(parts.processed_state, 'unprocessed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'tak001_offender_identification')
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_'
            'tak001_offender_identification_002_file_split_size300.csv'))
    self.assertEqual(parts.processed_state, 'unprocessed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'tak001_offender_identification')
    self.assertEqual(parts.filename_suffix, '002_file_split_size300')
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2019-09-07T00:09:18:770655'))
    self.assertEqual(parts.date_str, '2019-09-07')
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            'storage_bucket/region_subdir/2020-04-29/'
            'processed_2020-04-29T18:02:41:789323_test_file-(1).csv'))
    self.assertEqual(parts.processed_state, 'processed')
    self.assertEqual(parts.extension, 'csv')
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.UNSPECIFIED)
    self.assertEqual(parts.file_tag, 'test_file')
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat('2020-04-29T18:02:41:789323'))
    self.assertEqual(parts.date_str, '2020-04-29')
    self.assertEqual(parts.is_file_split, False)
def test_export_happy_path(self) -> None:
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path('gs://bucket1/US_XX'),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path('gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )

    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)
    mock_fs.exists.return_value = True

    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths

    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path('txt'),
        export_config_two_staging.output_path('txt')
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths

    # Make the actual call
    exporter = CompositeBigQueryViewExporter(
        mock_bq_client, mock_fs, [delegate_one, delegate_two])
    exporter.export_and_validate([export_config_one, export_config_two])

    # Assert all mocks called as expected
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    mock_fs.copy.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.json'),
            GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.json'),
            GcsfsFilePath(bucket_name='bucket2', blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.txt'),
            GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.txt'),
            GcsfsFilePath(bucket_name='bucket2', blob_name='US_XX/view2.txt'))
    ])
    mock_fs.delete.assert_has_calls([
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.json')),
        call(GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.json')),
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='staging/US_XX/view1.txt')),
        call(GcsfsFilePath(bucket_name='bucket2', blob_name='staging/US_XX/view2.txt'))
    ])
    mock_fs.exists.assert_has_calls([
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.json')),
        call(GcsfsFilePath(bucket_name='bucket2', blob_name='US_XX/view2.json')),
        call(GcsfsFilePath(bucket_name='bucket1', blob_name='US_XX/view1.txt')),
        call(GcsfsFilePath(bucket_name='bucket2', blob_name='US_XX/view2.txt')),
    ])
def _lock_body_for_lock(self, name: str) -> Optional[GCSPseudoLockBody]:
    path = GcsfsFilePath(bucket_name=self.bucket_name, blob_name=name)
    return self._lock_body_for_path(path)
def test_filename_parts_from_path_with_file_type(self) -> None:
    with self.assertRaises(DirectIngestError):
        filename_parts_from_path(
            GcsfsFilePath.from_absolute_path("bucket/us_ca_sf/elite_offenders.csv")
        )

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/unprocessed_2019-08-07T22:09:18:770655_"
            "raw_elite_offenders.csv"
        )
    )
    self.assertEqual(parts.processed_state, "unprocessed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "elite_offenders")
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-08-07T22:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-08-07")
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
            "ingest_view_elite_offenders.csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
    self.assertEqual(parts.file_tag, "elite_offenders")
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
            "raw_elite_offenders_1split.csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "elite_offenders")
    self.assertEqual(parts.filename_suffix, "1split")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    # Needs the actual file_split suffix to be a file split
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
            "ingest_view_elite_offenders_002_file_split.csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
    self.assertEqual(parts.file_tag, "elite_offenders")
    self.assertEqual(parts.filename_suffix, "002_file_split")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
            "raw_elite_offenders_002_file_split_size300.csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "elite_offenders")
    self.assertEqual(parts.filename_suffix, "002_file_split_size300")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
            "ingest_view_BrazosCounty_2019_09_25.csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
    self.assertEqual(parts.file_tag, "BrazosCounty")
    self.assertEqual(parts.filename_suffix, "2019_09_25")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-nd/processed_2019-09-07T00:09:18:770655_"
            "raw_BrazosCounty_2019_09_25_002_file_split_size300.csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "BrazosCounty")
    self.assertEqual(parts.filename_suffix, "2019_09_25_002_file_split_size300")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_"
            "ingest_view_tak001_offender_identification.csv"
        )
    )
    self.assertEqual(parts.processed_state, "unprocessed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.INGEST_VIEW)
    self.assertEqual(parts.file_tag, "tak001_offender_identification")
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, False)
    self.assertEqual(parts.file_split_size, None)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_"
            "raw_tak001_offender_identification_002_file_split_size300.csv"
        )
    )
    self.assertEqual(parts.processed_state, "unprocessed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "tak001_offender_identification")
    self.assertEqual(parts.filename_suffix, "002_file_split_size300")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "storage_bucket/raw/2020/04/29/"
            "processed_2020-04-29T18:02:41:789323_raw_test_file-(1).csv"
        )
    )
    self.assertEqual(parts.processed_state, "processed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "test_file")
    self.assertEqual(parts.filename_suffix, None)
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2020-04-29T18:02:41:789323"),
    )
    self.assertEqual(parts.date_str, "2020-04-29")
    self.assertEqual(parts.is_file_split, False)

    parts = filename_parts_from_path(
        GcsfsFilePath.from_absolute_path(
            "bucket-us-mo/unprocessed_2019-09-07T00:09:18:770655_"
            "raw_tak001_offender_identification_002_file_split_size300-(5).csv"
        )
    )
    self.assertEqual(parts.processed_state, "unprocessed")
    self.assertEqual(parts.extension, "csv")
    self.assertEqual(parts.file_type, GcsfsDirectIngestFileType.RAW_DATA)
    self.assertEqual(parts.file_tag, "tak001_offender_identification")
    self.assertEqual(parts.filename_suffix, "002_file_split_size300")
    self.assertEqual(
        parts.utc_upload_datetime,
        datetime.datetime.fromisoformat("2019-09-07T00:09:18:770655"),
    )
    self.assertEqual(parts.date_str, "2019-09-07")
    self.assertEqual(parts.is_file_split, True)
    self.assertEqual(parts.file_split_size, 300)
def get_output_path(self, chunk_num: int) -> GcsfsFilePath:
    name, _extension = os.path.splitext(self.path.file_name)
    return GcsfsFilePath.from_directory_and_file_name(
        self.output_directory_path, f"temp_direct_ingest_{name}_{chunk_num}.csv"
    )
def export(
    self, export_configs: Sequence[ExportBigQueryViewConfig]
) -> List[GcsfsFilePath]:
    export_query_configs = [
        c.as_export_query_config(bigquery.DestinationFormat.NEWLINE_DELIMITED_JSON)
        for c in export_configs
    ]
    self.bq_client.export_query_results_to_cloud_storage(export_query_configs)
    return [
        GcsfsFilePath.from_absolute_path(config.output_uri)
        for config in export_query_configs
    ]
def post_process_downloads(self, downloaded_path: GcsfsFilePath, _: GCSFileSystem) -> str:
    """The US_ID server doesn't require any post-processing."""
    return downloaded_path.abs_path()
def _gcsfs_path_for_batch_metadata(
    batch_id: str, state_code: StateCode
) -> GcsfsFilePath:
    return GcsfsFilePath.from_absolute_path(
        f"gs://{get_email_content_bucket_name()}/{state_code.value}/{batch_id}/metadata.json"
    )
def delete(self, path: GcsfsFilePath) -> None:
    with self.mutex:
        self.files.pop(path.abs_path())
def _split_file_if_necessary(self, path: GcsfsFilePath) -> bool:
    """Checks if the given file needs to be split according to this controller's
    |file_split_line_limit|.

    Returns True if the file was split, False if splitting was not necessary.
    """
    should_split = self._should_split_file(path)
    if not should_split:
        logging.info("No need to split file path [%s].", path.abs_path())
        return False

    logging.info("Proceeding to file splitting for path [%s].", path.abs_path())

    original_metadata = None
    if self.region.are_ingest_view_exports_enabled_in_env():
        original_metadata = self.file_metadata_manager.get_file_metadata(path)

    output_dir = GcsfsDirectoryPath.from_file_path(path)

    split_contents_paths = self._split_file(path)
    upload_paths = []
    for i, split_contents_path in enumerate(split_contents_paths):
        upload_path = self._create_split_file_path(path, output_dir, split_num=i)
        logging.info(
            "Copying split [%s] to direct ingest directory at path [%s].",
            i,
            upload_path.abs_path(),
        )
        upload_paths.append(upload_path)
        try:
            self.fs.mv(split_contents_path, upload_path)
        except Exception as e:
            logging.error(
                "Threw error while copying split files from temp bucket - attempting"
                " to clean up before rethrowing. [%s]",
                e,
            )
            for p in upload_paths:
                self.fs.delete(p)
            raise e

    # We wait to register files with the metadata manager until all files have been
    # successfully copied to avoid leaving the metadata manager in an inconsistent state.
    if self.region.are_ingest_view_exports_enabled_in_env():
        if not isinstance(original_metadata, DirectIngestIngestFileMetadata):
            raise ValueError("Attempting to split a non-ingest view type file")

        logging.info(
            "Registering [%s] split files with the metadata manager.",
            len(upload_paths),
        )

        for upload_path in upload_paths:
            ingest_file_metadata = self.file_metadata_manager.register_ingest_file_split(
                original_metadata, upload_path
            )
            self.file_metadata_manager.mark_ingest_view_exported(ingest_file_metadata)

        self.file_metadata_manager.mark_file_as_processed(path)

    logging.info(
        "Done splitting file [%s] into [%s] paths, moving it to storage.",
        path.abs_path(),
        len(split_contents_paths),
    )

    self.fs.mv_path_to_storage(path, self.storage_directory_path)

    return True
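# A simplified, standalone sketch (not the controller's implementation) of the idea
# behind _split_file_if_necessary: break a file's data rows into chunks no larger
# than a configured line limit, keeping the header row on every chunk.
def _split_rows_example(rows, line_limit):
    header, data = rows[0], rows[1:]
    return [
        [header] + data[i : i + line_limit]
        for i in range(0, len(data), line_limit)
    ]


_chunks = _split_rows_example(["col_a,col_b", "1,2", "3,4", "5,6"], line_limit=2)
assert _chunks == [["col_a,col_b", "1,2", "3,4"], ["col_a,col_b", "5,6"]]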
def get_attachment_filepath(batch_id: str, email_address: str) -> GcsfsFilePath:
    bucket = get_email_content_bucket_name()
    folder = get_attachments_folder(batch_id)
    return GcsfsFilePath.from_absolute_path(
        f"gs://{bucket}/{folder}/{email_address}.txt"
    )