def _export_optimized_format(
        self,
        export_config: ExportBigQueryViewConfig,
        formatted: OptimizedMetricRepresentation,
        storage_client: storage.Client,
    ) -> GcsfsFilePath:
        """Uploads the optimized metric representation to Cloud Storage.

        The destination object is derived from |export_config| (a .txt path);
        the payload is compressed before upload and matching format metadata is
        set on the blob. Returns the path the file was written to.
        """
        destination = export_config.output_path(extension="txt")

        logging.info(
            "Writing optimized metric file %s to GCS bucket %s...",
            destination.blob_name,
            destination.bucket_name,
        )

        # Metadata must describe the compressed transmission format, so both
        # calls use should_compress=True.
        target_blob = storage.Blob.from_string(destination.uri(), client=storage_client)
        self._set_format_metadata(formatted, target_blob, should_compress=True)
        payload = self._produce_transmission_format(formatted, should_compress=True)
        target_blob.upload_from_string(payload, content_type="text/plain")

        logging.info(
            "Optimized metric file %s written to GCS bucket %s.",
            destination.blob_name,
            destination.bucket_name,
        )

        return destination
    def setUp(self) -> None:
        """Stubs project-id resolution and builds the two staging export
        configurations whose .txt output paths the tests reference."""
        self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = "project-id"

        self.mock_bq_view_namespace = BigQueryViewNamespace.STATE

        # Two simple metric views backing the export configs below.
        first_view = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            description="view1 description",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()
        second_view = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            description="view2 description",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        staging_config_one = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=first_view,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )
        staging_config_two = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=second_view,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        self.staging_paths = [
            staging_config.output_path("txt")
            for staging_config in (staging_config_one, staging_config_two)
        ]
# ---- Example #3 ----
    def setUp(self) -> None:
        """Stubs project-id resolution and builds the two staging export
        configurations whose .txt output paths the tests reference."""
        self.metadata_patcher = patch('recidiviz.utils.metadata.project_id')
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = 'project-id'

        # Two simple metric views backing the export configs below.
        first_view = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()
        second_view = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        staging_config_one = ExportBigQueryViewConfig(
            view=first_view,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )
        staging_config_two = ExportBigQueryViewConfig(
            view=second_view,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        self.staging_paths = [
            staging_config.output_path('txt')
            for staging_config in (staging_config_one, staging_config_two)
        ]
# ---- Example #4 ----
    def test_export_happy_path(self) -> None:
        """All delegate exports and validations succeed: staging files are
        copied to their final locations, the staging copies are deleted, and
        each final path is existence-checked.

        Note: the assert_has_calls checks below are order-sensitive, so they
        also pin the sequence in which the composite exporter performs the
        copy/delete/exists operations.
        """
        # Final vs. staging config pairs for two views in two buckets; staging
        # paths live under a staging/ prefix within the same bucket.
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        # Final existence validation passes for every exported path.
        mock_fs.exists.return_value = True

        # First delegate exports JSON files to staging and reports success.
        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        # Second delegate exports txt files to staging and reports success.
        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])
        exporter.export_and_validate([export_config_one, export_config_two])

        # Assert all mocks called as expected
        # Each delegate is handed the staging (not final) configs.
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        # Every staging file (both formats, both buckets) is copied to its
        # final destination...
        mock_fs.copy.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt'))
        ])

        # ...and the staging copy is removed afterwards.
        mock_fs.delete.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'))
        ])

        # Final existence validation is performed on each final path.
        mock_fs.exists.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt')),
        ])
# ---- Example #5 ----
    def test_export_final_existence_validation_failed(self) -> None:
        """Delegate validations pass but the final existence check fails.

        Because every delegate succeeded, staging files are still copied to
        their final locations and staging copies deleted; the failure only
        surfaces during the final mock_fs.exists() validation, which raises
        ViewExportValidationError on the first missing path and stops there.
        """
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        # This should cause export_and_validate to raise a ValueError
        mock_fs.exists.return_value = False

        # Both delegates succeed during the staging phase.
        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])

        with pytest.raises(ViewExportValidationError) as e:
            exporter.export_and_validate(
                [export_config_one, export_config_two])

        # We get an error at the very end of the export chain because even though delegate validations passed, the
        # final validation failed
        self.assertIn(
            'Validation on path bucket1/US_XX/view1.json failed the metric file export. '
            'Stopping execution here.', str(e.value))

        # The delegate exporters validations all passed so we still copy from staging to final
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        # Copies to final destinations happen for both formats and buckets...
        mock_fs.copy.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt'),
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'),
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='US_XX/view2.txt'))
        ])

        # ...and staging copies are cleaned up before the final validation.
        mock_fs.delete.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.json')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.json')),
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='staging/US_XX/view1.txt')),
            call(
                GcsfsFilePath(bucket_name='bucket2',
                              blob_name='staging/US_XX/view2.txt'))
        ])

        # Only one call to the Exists validation made because the first one failed
        mock_fs.exists.assert_has_calls([
            call(
                GcsfsFilePath(bucket_name='bucket1',
                              blob_name='US_XX/view1.json')),
        ])
# ---- Example #6 ----
    def test_export_staging_delegate_validation_failed(self) -> None:
        """A delegate failing staging validation aborts the composite export.

        The first delegate succeeds; the second raises
        ViewExportValidationError, which must propagate out of
        export_and_validate with its message intact.

        Fix: the original created and configured delegate_one/delegate_two
        once, then immediately discarded and recreated them — the first set of
        assignments was dead code and has been removed.
        """
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/US_XX'),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/US_XX'),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        mock_bq_client = create_autospec(BigQueryClient)
        mock_fs = create_autospec(GCSFileSystem)

        # First delegate succeeds and reports its staging output paths.
        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path('json'),
            export_config_two_staging.output_path('json')
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        # Second delegate fails validation, which should abort the export.
        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two.export_and_validate.side_effect = ViewExportValidationError(
            'Validation failed')

        # Make the actual call
        exporter = CompositeBigQueryViewExporter(mock_bq_client, mock_fs,
                                                 [delegate_one, delegate_two])

        with pytest.raises(ViewExportValidationError) as e:
            exporter.export_and_validate(
                [export_config_one, export_config_two])

        self.assertIn('Validation failed', str(e.value))
# ---- Example #7 ----
    def test_export_happy_path(self) -> None:
        """All per-format exporters succeed: staging files are copied to their
        final destinations and the staging copies deleted.

        Unlike the composite-exporter test above, copy/delete order is not
        guaranteed here, so assert_has_calls is used with any_order=True.
        """
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/US_XX"),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/US_XX"),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        mock_fs = create_autospec(GCSFileSystem)

        mock_fs.exists.return_value = True

        # JSON exporter writes json staging files and reports success.
        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path("json"),
            export_config_two_staging.output_path("json"),
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        # METRIC exporter writes txt staging files and reports success.
        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two_staging_paths = [
            export_config_one_staging.output_path("txt"),
            export_config_two_staging.output_path("txt"),
        ]
        delegate_two.export_and_validate.return_value = delegate_two_staging_paths

        # Make the actual call
        export_views_with_exporters(
            mock_fs,
            [export_config_one, export_config_two],
            {
                ExportOutputFormatType.JSON: delegate_one,
                ExportOutputFormatType.METRIC: delegate_two,
            },
        )

        # Assert all mocks called as expected
        delegate_one.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        delegate_two.export_and_validate.assert_has_calls([
            call([export_config_one_staging, export_config_two_staging]),
        ])

        # Every staging file is copied to its final destination (order not
        # significant)...
        mock_fs.copy.assert_has_calls(
            [
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.json"),
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="US_XX/view1.json"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.json"),
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="US_XX/view2.json"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.txt"),
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="US_XX/view1.txt"),
                ),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.txt"),
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="US_XX/view2.txt"),
                ),
            ],
            any_order=True,
        )

        # ...and each staging copy is removed afterwards.
        mock_fs.delete.assert_has_calls(
            [
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.json")),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.json")),
                call(
                    GcsfsFilePath(bucket_name="bucket1",
                                  blob_name="staging/US_XX/view1.txt")),
                call(
                    GcsfsFilePath(bucket_name="bucket2",
                                  blob_name="staging/US_XX/view2.txt")),
            ],
            any_order=True,
        )
# ---- Example #8 ----
    def test_export_staging_delegate_validation_failed(self) -> None:
        """A delegate raising during staging export aborts the whole export.

        The JSON delegate succeeds; the METRIC delegate raises
        ViewExportValidationError, which must propagate out of
        export_views_with_exporters with its message intact.

        Fix: the original created and configured delegate_one/delegate_two
        once, then immediately discarded and recreated them — the first set of
        assignments was dead code and has been removed.
        """
        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()

        export_config_one = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/US_XX"),
        )
        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        export_config_two = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/US_XX"),
        )
        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        mock_fs = create_autospec(GCSFileSystem)

        # JSON delegate succeeds and reports its staging output paths.
        delegate_one = create_autospec(BigQueryViewExporter)
        delegate_one_staging_paths = [
            export_config_one_staging.output_path("json"),
            export_config_two_staging.output_path("json"),
        ]
        delegate_one.export_and_validate.return_value = delegate_one_staging_paths

        # METRIC delegate fails validation, which should abort the export.
        delegate_two = create_autospec(BigQueryViewExporter)
        delegate_two.export_and_validate.side_effect = ViewExportValidationError(
            "Validation failed")

        # Make the actual call
        with pytest.raises(ViewExportValidationError) as e:
            export_views_with_exporters(
                mock_fs,
                [export_config_one, export_config_two],
                {
                    ExportOutputFormatType.JSON: delegate_one,
                    ExportOutputFormatType.METRIC: delegate_two,
                },
            )

        self.assertIn("Validation failed", str(e.value))