def _export_optimized_format(
    self,
    export_config: ExportBigQueryViewConfig,
    formatted: OptimizedMetricRepresentation,
    storage_client: storage.Client,
) -> GcsfsFilePath:
    """Uploads the optimized metric representation to Cloud Storage.

    The destination is derived from |export_config| (a ".txt" output path).
    Returns the GCS path the file was written to.
    """
    destination = export_config.output_path(extension="txt")
    logging.info(
        "Writing optimized metric file %s to GCS bucket %s...",
        destination.blob_name,
        destination.bucket_name,
    )
    output_blob = storage.Blob.from_string(destination.uri(), client=storage_client)
    # Attach format metadata to the blob before uploading its contents.
    self._set_format_metadata(formatted, output_blob, should_compress=True)
    payload = self._produce_transmission_format(formatted, should_compress=True)
    output_blob.upload_from_string(payload, content_type="text/plain")
    logging.info(
        "Optimized metric file %s written to GCS bucket %s.",
        destination.blob_name,
        destination.bucket_name,
    )
    return destination
def setUp(self) -> None:
    """Stubs out project-id resolution and records the expected staging paths
    for two metric views exported to two different staging buckets.
    """
    self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = "project-id"
    self.mock_bq_view_namespace = BigQueryViewNamespace.STATE

    # (view_id, description, query, dimensions, intermediate table, staging dir)
    view_specs = [
        ("view1", "view1 description", "select * from table",
         ("a", "b", "c"), "intermediate_table", "gs://bucket1/staging/US_XX"),
        ("view2", "view2 description", "select * from view2",
         ("d", "e", "f"), "intermediate_table2", "gs://bucket2/staging/US_XX"),
    ]
    self.staging_paths = []
    for view_id, description, query, dims, table_name, directory in view_specs:
        view = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id=view_id,
            description=description,
            view_query_template=query,
            dimensions=dims,
        ).build()
        staging_config = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=view,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name=table_name,
            output_directory=GcsfsDirectoryPath.from_absolute_path(directory),
        )
        self.staging_paths.append(staging_config.output_path("txt"))
def setUp(self) -> None:
    """Stubs out project-id resolution and records the expected staging paths
    for two metric views exported to two different staging buckets.
    """
    self.metadata_patcher = patch('recidiviz.utils.metadata.project_id')
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = 'project-id'

    # (view_id, query, dimensions, intermediate table, staging directory)
    view_specs = [
        ('view1', 'select * from table', ['a', 'b', 'c'],
         'intermediate_table', 'gs://bucket1/staging/US_XX'),
        ('view2', 'select * from view2', ['d', 'e', 'f'],
         'intermediate_table2', 'gs://bucket2/staging/US_XX'),
    ]
    staging_paths = []
    for view_id, query, dims, table_name, directory in view_specs:
        view = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id=view_id,
            view_query_template=query,
            dimensions=dims,
        ).build()
        staging_config = ExportBigQueryViewConfig(
            view=view,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name=table_name,
            output_directory=GcsfsDirectoryPath.from_absolute_path(directory),
        )
        staging_paths.append(staging_config.output_path('txt'))
    self.staging_paths = staging_paths
def test_export_happy_path(self) -> None:
    # Happy path: both delegate exporters succeed, so every staging file is
    # copied to its final (non-staging) path, the staging copy is deleted,
    # and final-path existence is verified for each output.
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/US_XX'),
    )
    # Same view, but exported under the staging/ prefix first.
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )
    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)
    # All final existence checks pass.
    mock_fs.exists.return_value = True
    # First delegate exports JSON files to the staging directories.
    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths
    # Second delegate exports txt files to the staging directories.
    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path('txt'),
        export_config_two_staging.output_path('txt')
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths
    # Make the actual call
    exporter = CompositeBigQueryViewExporter(
        mock_bq_client, mock_fs, [delegate_one, delegate_two])
    exporter.export_and_validate([export_config_one, export_config_two])
    # Assert all mocks called as expected: each delegate was asked to export
    # the staging variants of both configs...
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    # ...every staging file was copied to its final path...
    mock_fs.copy.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.txt'))
    ])
    # ...every staging copy was cleaned up...
    mock_fs.delete.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'))
    ])
    # ...and existence was verified for every final output path.
    mock_fs.exists.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.txt')),
    ])
def test_export_final_existence_validation_failed(self) -> None:
    # Delegate exports and validations all succeed, but the composite
    # exporter's own final existence check on the copied files fails.
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/US_XX'),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )
    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)
    # Failing the final existence check should cause export_and_validate to
    # raise a ViewExportValidationError (asserted via pytest.raises below).
    mock_fs.exists.return_value = False
    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths
    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path('txt'),
        export_config_two_staging.output_path('txt')
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths
    # Make the actual call
    exporter = CompositeBigQueryViewExporter(
        mock_bq_client, mock_fs, [delegate_one, delegate_two])
    with pytest.raises(ViewExportValidationError) as e:
        exporter.export_and_validate(
            [export_config_one, export_config_two])
    # We get an error at the very end of the export chain because even though
    # delegate validations passed, the final validation failed
    self.assertIn(
        'Validation on path bucket1/US_XX/view1.json failed the metric file export. '
        'Stopping execution here.',
        str(e.value))
    # The delegate exporters validations all passed so we still copy from
    # staging to final
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    mock_fs.copy.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt'),
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'),
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='US_XX/view2.txt'))
    ])
    mock_fs.delete.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.json')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.json')),
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='staging/US_XX/view1.txt')),
        call(
            GcsfsFilePath(bucket_name='bucket2',
                          blob_name='staging/US_XX/view2.txt'))
    ])
    # Only one call to the Exists validation made because the first one failed
    mock_fs.exists.assert_has_calls([
        call(
            GcsfsFilePath(bucket_name='bucket1',
                          blob_name='US_XX/view1.json')),
    ])
def test_export_staging_delegate_validation_failed(self) -> None:
    """A validation error raised by a delegate exporter while exporting to
    staging propagates out of CompositeBigQueryViewExporter.

    Fix: the original built and configured both delegate mocks, then
    immediately re-created them with create_autospec, throwing away the
    first configured pair. The dead duplicate setup is removed; the mocks
    that actually reach the exporter are configured once.
    """
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view1',
        view_query_template='select * from table',
        dimensions=['a', 'b', 'c'],
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/US_XX'),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket1/staging/US_XX'),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id='dataset',
        view_id='view2',
        view_query_template='select * from view2',
        dimensions=['d', 'e', 'f'],
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/US_XX'),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause='WHERE state_code = \'US_XX\'',
        intermediate_table_name='intermediate_table2',
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            'gs://bucket2/staging/US_XX'),
    )
    mock_bq_client = create_autospec(BigQueryClient)
    mock_fs = create_autospec(GCSFileSystem)

    # First delegate succeeds and reports its staging output paths.
    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one.export_and_validate.return_value = [
        export_config_one_staging.output_path('json'),
        export_config_two_staging.output_path('json')
    ]
    # Second delegate fails validation, which should abort the export.
    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two.export_and_validate.side_effect = ViewExportValidationError(
        'Validation failed')

    # Make the actual call
    exporter = CompositeBigQueryViewExporter(
        mock_bq_client, mock_fs, [delegate_one, delegate_two])
    with pytest.raises(ViewExportValidationError) as e:
        exporter.export_and_validate(
            [export_config_one, export_config_two])
    self.assertIn('Validation failed', str(e.value))
def test_export_happy_path(self) -> None:
    # Happy path: both format-specific exporters succeed, so every staging
    # file is copied to its final path and the staging copy is deleted.
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view1",
        view_query_template="select * from table",
        dimensions=("a", "b", "c"),
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/US_XX"),
    )
    # Same view, but exported under the staging/ prefix first.
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/staging/US_XX"),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view2",
        view_query_template="select * from view2",
        dimensions=("d", "e", "f"),
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/US_XX"),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/staging/US_XX"),
    )
    mock_fs = create_autospec(GCSFileSystem)
    # Existence checks all pass.
    mock_fs.exists.return_value = True
    # JSON exporter reports its staging output paths.
    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one_staging_paths = [
        export_config_one_staging.output_path("json"),
        export_config_two_staging.output_path("json"),
    ]
    delegate_one.export_and_validate.return_value = delegate_one_staging_paths
    # METRIC exporter reports its staging output paths (txt files).
    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two_staging_paths = [
        export_config_one_staging.output_path("txt"),
        export_config_two_staging.output_path("txt"),
    ]
    delegate_two.export_and_validate.return_value = delegate_two_staging_paths
    # Make the actual call
    export_views_with_exporters(
        mock_fs,
        [export_config_one, export_config_two],
        {
            ExportOutputFormatType.JSON: delegate_one,
            ExportOutputFormatType.METRIC: delegate_two,
        },
    )
    # Assert all mocks called as expected: each delegate exported the staging
    # variants of both configs...
    delegate_one.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    delegate_two.export_and_validate.assert_has_calls([
        call([export_config_one_staging, export_config_two_staging]),
    ])
    # ...every staging file was copied to its final path (order not
    # guaranteed, hence any_order=True)...
    mock_fs.copy.assert_has_calls(
        [
            call(
                GcsfsFilePath(bucket_name="bucket1",
                              blob_name="staging/US_XX/view1.json"),
                GcsfsFilePath(bucket_name="bucket1",
                              blob_name="US_XX/view1.json"),
            ),
            call(
                GcsfsFilePath(bucket_name="bucket2",
                              blob_name="staging/US_XX/view2.json"),
                GcsfsFilePath(bucket_name="bucket2",
                              blob_name="US_XX/view2.json"),
            ),
            call(
                GcsfsFilePath(bucket_name="bucket1",
                              blob_name="staging/US_XX/view1.txt"),
                GcsfsFilePath(bucket_name="bucket1",
                              blob_name="US_XX/view1.txt"),
            ),
            call(
                GcsfsFilePath(bucket_name="bucket2",
                              blob_name="staging/US_XX/view2.txt"),
                GcsfsFilePath(bucket_name="bucket2",
                              blob_name="US_XX/view2.txt"),
            ),
        ],
        any_order=True,
    )
    # ...and every staging copy was cleaned up.
    mock_fs.delete.assert_has_calls(
        [
            call(
                GcsfsFilePath(bucket_name="bucket1",
                              blob_name="staging/US_XX/view1.json")),
            call(
                GcsfsFilePath(bucket_name="bucket2",
                              blob_name="staging/US_XX/view2.json")),
            call(
                GcsfsFilePath(bucket_name="bucket1",
                              blob_name="staging/US_XX/view1.txt")),
            call(
                GcsfsFilePath(bucket_name="bucket2",
                              blob_name="staging/US_XX/view2.txt")),
        ],
        any_order=True,
    )
def test_export_staging_delegate_validation_failed(self) -> None:
    """A validation error raised by one format's exporter during the staging
    export propagates out of export_views_with_exporters.

    Fix: the original built and configured both delegate mocks, then
    immediately re-created them with create_autospec, throwing away the
    first configured pair. The dead duplicate setup is removed; the mocks
    that are actually passed in are configured once.
    """
    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view1",
        view_query_template="select * from table",
        dimensions=("a", "b", "c"),
    ).build()
    export_config_one = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/US_XX"),
    )
    export_config_one_staging = ExportBigQueryViewConfig(
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/staging/US_XX"),
    )
    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view2",
        view_query_template="select * from view2",
        dimensions=("d", "e", "f"),
    ).build()
    export_config_two = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/US_XX"),
    )
    export_config_two_staging = ExportBigQueryViewConfig(
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/staging/US_XX"),
    )
    mock_fs = create_autospec(GCSFileSystem)

    # JSON exporter succeeds and reports its staging output paths.
    delegate_one = create_autospec(BigQueryViewExporter)
    delegate_one.export_and_validate.return_value = [
        export_config_one_staging.output_path("json"),
        export_config_two_staging.output_path("json"),
    ]
    # METRIC exporter fails validation, which should abort the export.
    delegate_two = create_autospec(BigQueryViewExporter)
    delegate_two.export_and_validate.side_effect = ViewExportValidationError(
        "Validation failed")

    # Make the actual call
    with pytest.raises(ViewExportValidationError) as e:
        export_views_with_exporters(
            mock_fs,
            [export_config_one, export_config_two],
            {
                ExportOutputFormatType.JSON: delegate_one,
                ExportOutputFormatType.METRIC: delegate_two,
            },
        )
    self.assertIn("Validation failed", str(e.value))