Example #1
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[str] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )
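
This constructor falls back to the region's default ingest bucket unless an explicit gcs_destination_path is supplied. A minimal sketch of that fallback pattern, using plain strings in place of GcsfsDirectoryPath (default_ingest_bucket here is a hypothetical stand-in, not the real helper):

from typing import Optional


def default_ingest_bucket(project_id: str, region: str) -> str:
    # Hypothetical stand-in for gcsfs_direct_ingest_directory_path_for_region.
    return f"gs://{project_id}-direct-ingest-state-{region.lower()}"


def resolve_destination(
    project_id: str, region: str, gcs_destination_path: Optional[str] = None
) -> str:
    # Use the explicit override when given; otherwise fall back to the
    # region's default ingest bucket.
    if gcs_destination_path is None:
        return default_ingest_bucket(project_id, region)
    return gcs_destination_path


assert resolve_destination("proj", "US_XX") == "gs://proj-direct-ingest-state-us_xx"
assert resolve_destination("proj", "US_XX", "gs://custom") == "gs://custom"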
    def __init__(
        self,
        region_code: str,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        project_id: str,
    ):
        self.region_code = region_code
        self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.project_id = project_id
        self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id=self.project_id))
        self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code,
                SystemLevel.STATE,
                GcsfsDirectIngestFileType.RAW_DATA,
                project_id=self.project_id,
            ))
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_storage_files_from_unspecified_to_raw_start_bound_{self.region_code}_region_{self.start_date_bound}"
            f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None
    def __init__(
        self,
        region_code: str,
        file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
    ):
        self.file_type = file_type
        self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id="recidiviz-123"
            )
        )
        self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code, SystemLevel.STATE, project_id="recidiviz-staging"
            )
        )
        self.dry_run = dry_run
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound

        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
        self.mutex = threading.Lock()
        self.copy_list: List[Tuple[str, str]] = []
        self.copy_progress: Optional[Bar] = None
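
Both constructors above derive their log file name from the run parameters plus an ISO-8601 timestamp, so repeated runs never overwrite each other's output. A small runnable sketch of that naming scheme (build_log_path and its arguments are illustrative):

import datetime
import os


def build_log_path(prefix: str, region_code: str, dry_run: bool) -> str:
    # Embed the run parameters and an ISO-8601 timestamp so each invocation
    # writes to a unique, self-describing file next to this script.
    filename = (
        f"{prefix}_{region_code}_dry_run_{dry_run}"
        f"_{datetime.datetime.now().isoformat()}.txt"
    )
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)


if __name__ == "__main__":
    print(build_log_path("copy_prod_to_staging_result", "us_xx", dry_run=True))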
    def test_get_configs_for_export_name(
            self, mock_environment: mock.MagicMock) -> None:
        """Tests get_configs_for_export_name function to ensure that export names correctly match"""

        mock_environment.return_value = "production"
        export_configs_for_filter = view_export_manager.get_configs_for_export_name(
            export_name=self.mock_export_name,
            state_code=self.mock_state_code,
            project_id=self.mock_project_id,
        )
        view = self.mock_view_builder.build()
        metric_view = self.mock_metric_view_builder.build()

        expected_view_config_list = [
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_big_query_view_namespace,
                view=view,
                view_filter_clause=
                f" WHERE state_code = '{self.mock_state_code}'",
                intermediate_table_name=
                f"{view.view_id}_table_{self.mock_state_code}",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code=self.mock_state_code,
                    )),
                export_output_formats=[ExportOutputFormatType.JSON],
            ),
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_big_query_view_namespace,
                view=metric_view,
                view_filter_clause=
                f" WHERE state_code = '{self.mock_state_code}'",
                intermediate_table_name=
                f"{view.view_id}_table_{self.mock_state_code}",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code=self.mock_state_code,
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            ),
        ]

        self.assertEqual(expected_view_config_list, export_configs_for_filter)

        # Test for case insensitivity

        export_configs_for_filter = view_export_manager.get_configs_for_export_name(
            export_name=self.mock_export_name.lower(),
            state_code=self.mock_state_code.lower(),
            project_id=self.mock_project_id,
        )
        self.assertEqual(expected_view_config_list, export_configs_for_filter)
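
The final assertion relies on get_configs_for_export_name treating the export name and state code case-insensitively. One plausible way to get that behavior is to normalize inputs before lookup; a minimal sketch with a hypothetical registry:

from typing import Dict, List

# Hypothetical registry keyed by canonical (upper-case) export name.
_EXPORTS: Dict[str, List[str]] = {"MOCK_EXPORT_NAME": ["view_1", "view_2"]}


def get_views_for_export_name(export_name: str) -> List[str]:
    # Normalize the caller's spelling so "mock_export_name" and
    # "MOCK_EXPORT_NAME" resolve to the same entry.
    return _EXPORTS[export_name.upper()]


assert get_views_for_export_name("mock_export_name") == get_views_for_export_name(
    "MOCK_EXPORT_NAME"
)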
Example #5
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level
            )
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path
        )

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level
                )
            )

        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path
        )

        self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_temporary_output_directory_path()
        )

        ingest_job_file_type_filter = (
            GcsfsDirectIngestFileType.INGEST_VIEW
            if self.region.is_raw_vs_ingest_file_name_detection_enabled()
            else None
        )
        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter,
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example #6
    def setUp(self) -> None:
        self.mock_bq_client = mock.create_autospec(BigQueryClient)
        self.mock_validator = mock.create_autospec(BigQueryViewExportValidator)

        self.mock_project_id = "fake-project"

        self.metadata_patcher = mock.patch(
            "recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = self.mock_project_id

        self.view_builder = SimpleBigQueryViewBuilder(
            dataset_id="test_dataset",
            view_id="test_view",
            view_query_template="SELECT NULL LIMIT 0",
        )
        self.second_view_builder = SimpleBigQueryViewBuilder(
            dataset_id="test_dataset",
            view_id="test_view_2",
            view_query_template="SELECT NULL LIMIT 0",
        )
        self.view_export_configs = [
            ExportBigQueryViewConfig(
                view=self.view_builder.build(),
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=
                f"{self.view_builder.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.HEADERLESS_CSV,
                ],
            ),
            ExportBigQueryViewConfig(
                view=self.second_view_builder.build(),
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=
                f"{self.second_view_builder.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.CSV,
                ],
            ),
        ]
    def test_export_dashboard_data_to_cloud_storage(
            self, mock_view_exporter,
            mock_view_update_manager_rematerialize) -> None:
        """Tests the table is created from the view and then extracted."""
        view_export_manager.export_view_data_to_cloud_storage(
            self.mock_state_code, mock_view_exporter)

        view = self.mock_view_builder.build()
        metric_view = self.mock_metric_view_builder.build()

        view_export_configs = [
            ExportBigQueryViewConfig(
                view=view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[ExportOutputFormatType.JSON],
            ),
            ExportBigQueryViewConfig(
                view=metric_view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            ),
        ]

        mock_view_update_manager_rematerialize.assert_called()
        mock_view_exporter.export_and_validate.assert_has_calls(
            [
                mock.call([]),  # CSV export
                mock.call([
                    view_export_configs[1].pointed_to_staging_subdirectory()
                ]),  # METRIC export
                mock.call([
                    conf.pointed_to_staging_subdirectory()
                    for conf in view_export_configs
                ]),  # JSON export
            ],
            any_order=True,
        )
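
The assertion above expects one exporter call per output format, in any order, each receiving configs redirected to a staging subdirectory: only the metric view carries the METRIC format, while both configs carry JSON. A compact, self-contained illustration of how unittest.mock's assert_has_calls behaves with any_order=True:

from unittest import mock

exporter = mock.MagicMock()
exporter.export_and_validate(["csv_configs"])
exporter.export_and_validate(["json_configs"])
exporter.export_and_validate(["metric_configs"])

# Passes: every expected call occurred, regardless of relative order.
exporter.export_and_validate.assert_has_calls(
    [mock.call(["metric_configs"]), mock.call(["csv_configs"])],
    any_order=True,
)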
Example #8
    def __init__(
        self,
        project_id: str,
        region: str,
        file_type_to_move: GcsfsDirectIngestFileType,
        destination_file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):
        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        if (
            self.file_type_to_move != self.destination_file_type
            and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
        ):
            raise ValueError(
                "Args file_type_to_move and destination_file_type must match "
                "unless the type to move is UNSPECIFIED"
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
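
The guard in this constructor enforces that the source and destination file types match unless the type being moved is UNSPECIFIED, the only case where a move may also retype the file. A self-contained sketch of the same invariant with a stand-in enum:

import enum


class FileType(enum.Enum):
    # Stand-in for GcsfsDirectIngestFileType.
    UNSPECIFIED = 0
    RAW_DATA = 1
    INGEST_VIEW = 2


def check_move_types(to_move: FileType, destination: FileType) -> None:
    # A move may only change the file type when starting from UNSPECIFIED.
    if to_move != destination and to_move != FileType.UNSPECIFIED:
        raise ValueError(
            "Args file_type_to_move and destination_file_type must match "
            "unless the type to move is UNSPECIFIED"
        )


check_move_types(FileType.UNSPECIFIED, FileType.RAW_DATA)  # ok: retyping allowed
check_move_types(FileType.RAW_DATA, FileType.RAW_DATA)  # ok: types match
try:
    check_move_types(FileType.RAW_DATA, FileType.INGEST_VIEW)
except ValueError:
    pass  # rejected: a typed file cannot change type mid-move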
Example #9
    def export_configs_for_views_to_export(self, project_id: str) -> Sequence[ExportMetricBigQueryViewConfig]:
        """Builds a list of ExportMetricBigQueryViewConfigs that define how all metric views in
        metric_view_builders_to_export should be exported to Google Cloud Storage."""
        view_filter_clause = (f" WHERE state_code = '{self.state_code_filter}'"
                              if self.state_code_filter else None)

        intermediate_table_name = "{export_view_name}_table"
        output_directory = self.output_directory_uri_template.format(
            project_id=project_id
        )

        if self.state_code_filter:
            intermediate_table_name += f"_{self.state_code_filter}"
            output_directory += f"/{self.state_code_filter}"

        return [
            ExportMetricBigQueryViewConfig(
                view=view,
                view_filter_clause=view_filter_clause,
                intermediate_table_name=intermediate_table_name.format(
                    export_view_name=view.view_id
                ),
                output_directory=GcsfsDirectoryPath.from_absolute_path(output_directory),
            )
            for view in [vb.build() for vb in self.metric_view_builders_to_export]
        ]
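
The comprehension above stamps out one config per built view, sharing the filter clause and output directory while templating the intermediate table name on each view's id. A reduced, runnable sketch with a dataclass standing in for ExportMetricBigQueryViewConfig:

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ViewConfig:
    # Simplified stand-in for ExportMetricBigQueryViewConfig.
    view_id: str
    view_filter_clause: Optional[str]
    intermediate_table_name: str
    output_directory: str


def configs_for_views(
    view_ids: List[str], output_directory: str, state_code: Optional[str]
) -> List[ViewConfig]:
    filter_clause = f" WHERE state_code = '{state_code}'" if state_code else None
    table_template = "{view_id}_table" + (f"_{state_code}" if state_code else "")
    directory = output_directory + (f"/{state_code}" if state_code else "")
    return [
        ViewConfig(
            view_id=view_id,
            view_filter_clause=filter_clause,
            intermediate_table_name=table_template.format(view_id=view_id),
            output_directory=directory,
        )
        for view_id in view_ids
    ]


configs = configs_for_views(["view1", "view2"], "gs://proj-bucket", "US_XX")
assert configs[0].intermediate_table_name == "view1_table_US_XX"
assert configs[0].output_directory == "gs://proj-bucket/US_XX"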
    def test_metric_export_state_agnostic(self):
        """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig class when the
        export is state-agnostic."""
        state_agnostic_dataset_export_config = ExportMetricDatasetConfig(
            dataset_id='dataset_id',
            metric_view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template=
            "gs://{project_id}-bucket-without-state-codes",
            state_code_filter=None,
            export_name=None)

        view_configs_to_export = state_agnostic_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id)

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportMetricBigQueryViewConfig(
                view=expected_view,
                view_filter_clause=None,
                intermediate_table_name=f"{expected_view.view_id}_table",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    state_agnostic_dataset_export_config.
                    output_directory_uri_template.format(
                        project_id=self.mock_project_id, )))
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
Example #11
    def test_metric_export_lantern_dashboard(self) -> None:
        """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig class when the
        export is state-agnostic."""
        lantern_dashboard_dataset_export_config = ExportViewCollectionConfig(
            view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template=
            "gs://{project_id}-bucket-without-state-codes",
            export_name="TEST_EXPORT",
            bq_view_namespace=self.mock_big_query_view_namespace,
        )

        view_configs_to_export = (lantern_dashboard_dataset_export_config.
                                  export_configs_for_views_to_export(
                                      project_id=self.mock_project_id, ))

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_big_query_view_namespace,
                view=expected_view,
                view_filter_clause=None,
                intermediate_table_name=f"{expected_view.view_id}_table",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    lantern_dashboard_dataset_export_config.
                    output_directory_uri_template.format(
                        project_id=self.mock_project_id, )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            )
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
    def test_export_dashboard_data_to_cloud_storage_validation_error(self,
                                                                     mock_view_exporter,
                                                                     mock_view_update_manager):
        """Tests the table is created from the view and then extracted."""

        mock_view_exporter.export_and_validate.side_effect = ViewExportValidationError

        # Should not throw
        metric_view_export_manager.export_view_data_to_cloud_storage(mock_state_code, mock_view_exporter)

        view = self.mock_view_builder.build()

        view_export_configs = [ExportMetricBigQueryViewConfig(
            view=view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code='US_XX',
                )
            )
        )]

        mock_view_update_manager.assert_called()
        mock_view_exporter.export_and_validate.assert_called_with(view_export_configs)
Example #13
    def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
        """Returns the paths to upload, each paired with the timestamp it should be
        normalized with. Skips any files that are not supported."""
        path_candidates = []
        for path, timestamp in self.paths_with_timestamps:
            if self.gcsfs.is_dir(path):
                directory = GcsfsDirectoryPath.from_absolute_path(path)
                files_in_directory = self.gcsfs.ls_with_blob_prefix(
                    bucket_name=directory.bucket_name,
                    blob_prefix=directory.relative_path,
                )
                for file in files_in_directory:
                    if self._is_supported_extension(file.abs_path()):
                        path_candidates.append((file.abs_path(), timestamp))
                    else:
                        self.skipped_files.append(file.abs_path())
            elif self.gcsfs.is_file(path):
                file = GcsfsFilePath.from_absolute_path(path)
                if self._is_supported_extension(file.abs_path()):
                    path_candidates.append((file.abs_path(), timestamp))
                else:
                    self.skipped_files.append(file.abs_path())
            else:
                logging.warning(
                    "Could not identify %s as a directory or a file in %s. Skipping",
                    path,
                    self.destination_ingest_bucket.uri(),
                )
                self.unable_to_upload_files.append(path)
                continue
        return path_candidates
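
get_paths_to_upload classifies each candidate path three ways: a directory is expanded one level and its contents filtered, a file is filtered directly, and anything else is recorded as un-uploadable. The same three-way walk against a local filesystem, with pathlib standing in for the GCS client and a made-up extension whitelist:

import pathlib
from typing import List, Tuple

SUPPORTED_EXTENSIONS = {".csv", ".txt"}  # made-up whitelist for illustration


def paths_to_upload(candidates: List[str]) -> Tuple[List[str], List[str], List[str]]:
    accepted: List[str] = []
    skipped: List[str] = []
    unable: List[str] = []
    for raw in candidates:
        path = pathlib.Path(raw)
        if path.is_dir():
            # Expand a directory one level and filter each child by extension.
            for child in path.iterdir():
                target = accepted if child.suffix in SUPPORTED_EXTENSIONS else skipped
                target.append(str(child))
        elif path.is_file():
            target = accepted if path.suffix in SUPPORTED_EXTENSIONS else skipped
            target.append(raw)
        else:
            # Neither a directory nor a file: record it and move on.
            unable.append(raw)
    return accepted, skipped, unable


accepted, skipped, unable = paths_to_upload(["/nonexistent/path"])
assert unable == ["/nonexistent/path"]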
    def test_metric_export_state_specific(self):
        """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig class when the
        export is state-specific."""
        specific_state_dataset_export_config = ExportMetricDatasetConfig(
            dataset_id='dataset_id',
            metric_view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template="gs://{project_id}-bucket",
            state_code_filter='US_XX',
            export_name=None)

        view_configs_to_export = specific_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id)

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportMetricBigQueryViewConfig(
                view=expected_view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    f"gs://{self.mock_project_id}-bucket/US_XX"))
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
Example #15
    def test_metric_export_lantern_dashboard_with_state(self):
        """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig class when the
        export is state-specific."""
        lantern_dashboard_with_state_dataset_export_config = ExportViewCollectionConfig(
            view_builders_to_export=self.views_for_dataset,
            output_directory_uri_template="gs://{project_id}-bucket",
            state_code_filter="US_XX",
            export_name="TEST_EXPORT",
            bq_view_namespace=self.mock_big_query_view_namespace,
        )

        view_configs_to_export = lantern_dashboard_with_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id
        )

        expected_view = self.mock_view_builder.build()

        expected_view_export_configs = [
            ExportBigQueryViewConfig(
                view=expected_view,
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    f"gs://{self.mock_project_id}-bucket/US_XX"
                ),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.METRIC,
                ],
            )
        ]

        self.assertEqual(expected_view_export_configs, view_configs_to_export)
Example #16
    def export_configs_for_views_to_export(
            self, project_id: str) -> Sequence[ExportBigQueryViewConfig]:
        """Builds a list of ExportBigQueryViewConfig that define how all views in
        view_builders_to_export should be exported to Google Cloud Storage."""
        view_filter_clause = (f" WHERE state_code = '{self.state_code_filter}'"
                              if self.state_code_filter else None)

        intermediate_table_name = "{export_view_name}_table"
        output_directory = self.output_directory_uri_template.format(
            project_id=project_id)

        if self.state_code_filter:
            intermediate_table_name += f"_{self.state_code_filter}"
            output_directory += f"/{self.state_code_filter}"

        configs = []
        for vb in self.view_builders_to_export:
            view = vb.build()
            optional_args = {}
            if self.export_output_formats is not None:
                optional_args[
                    "export_output_formats"] = self.export_output_formats
            configs.append(
                ExportBigQueryViewConfig(
                    view=view,
                    view_filter_clause=view_filter_clause,
                    intermediate_table_name=intermediate_table_name.format(
                        export_view_name=view.view_id),
                    output_directory=GcsfsDirectoryPath.from_absolute_path(
                        output_directory),
                    **optional_args,
                ))
        return configs
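
Note how export_output_formats is funneled through an optional_args dict so the keyword is only forwarded when explicitly configured, leaving ExportBigQueryViewConfig's own default in force otherwise. A minimal demonstration of that conditional-kwargs pattern:

from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass
class Config:
    name: str
    # The class supplies its own default when the caller omits the keyword.
    export_output_formats: Tuple[str, ...] = ("JSON",)


def build_config(name: str, formats: Optional[Tuple[str, ...]] = None) -> Config:
    optional_args = {}
    if formats is not None:
        # Forward the kwarg only when explicitly configured, so Config's
        # default stays in charge otherwise.
        optional_args["export_output_formats"] = formats
    return Config(name=name, **optional_args)


assert build_config("a").export_output_formats == ("JSON",)
assert build_config("b", ("JSON", "METRIC")).export_output_formats == ("JSON", "METRIC")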
    def __init__(self, file_type: GcsfsDirectIngestFileType, region_code: str,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool, project_id: str,
                 file_filter: Optional[str]):
        self.file_type = file_type
        self.region_code = region_code
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter
        self.project_id = project_id
        self.region_storage_dir_path_for_file_type = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code,
                SystemLevel.STATE,
                self.file_type,
                project_id=self.project_id))
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
            f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None
Example #18
    def create_export_manager(
        self,
        region: Region,
        is_detect_row_deletion_view: bool = False,
        materialize_raw_data_table_views: bool = False,
        controller_file_tags: Optional[List[str]] = None,
    ) -> DirectIngestIngestViewExportManager:
        metadata_manager = PostgresDirectIngestFileMetadataManager(
            region.region_code)
        controller_file_tags = (["ingest_view"] if controller_file_tags is None
                                else controller_file_tags)
        return DirectIngestIngestViewExportManager(
            region=region,
            fs=FakeGCSFileSystem(),
            ingest_directory_path=GcsfsDirectoryPath.from_absolute_path(
                "ingest_bucket"),
            big_query_client=self.mock_client,
            file_metadata_manager=metadata_manager,
            view_collector=_ViewCollector(  # type: ignore[arg-type]
                region,
                controller_file_tags=controller_file_tags,
                is_detect_row_deletion_view=is_detect_row_deletion_view,
                materialize_raw_data_table_views=
                materialize_raw_data_table_views,
            ),
            launched_file_tags=controller_file_tags,
        )
    def setUp(self) -> None:
        self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = "project-id"

        self.mock_bq_view_namespace = BigQueryViewNamespace.STATE

        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view1",
            description="view1 description",
            view_query_template="select * from table",
            dimensions=("a", "b", "c"),
        ).build()

        export_config_one_staging = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=metric_view_one,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket1/staging/US_XX"),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id="dataset",
            view_id="view2",
            description="view2 description",
            view_query_template="select * from view2",
            dimensions=("d", "e", "f"),
        ).build()

        export_config_two_staging = ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_bq_view_namespace,
            view=metric_view_two,
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="intermediate_table2",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://bucket2/staging/US_XX"),
        )

        self.staging_paths = [
            export_config_one_staging.output_path("txt"),
            export_config_two_staging.output_path("txt"),
        ]
Example #20
    def is_dir(self, path: str) -> bool:
        try:
            directory = GcsfsDirectoryPath.from_absolute_path(path)
            has_dir = self.ls_with_blob_prefix(
                bucket_name=directory.bucket_name,
                blob_prefix=directory.relative_path)
            return len(has_dir) > 0
        except ValueError:
            return False
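
Object stores have no true directories, so is_dir asks whether any blob shares the path's prefix and treats a non-empty listing as "directory exists". The same idea against an in-memory blob listing (the blob names are illustrative):

from typing import List

# Flat listing of blob names, as an object store would return them.
BLOBS: List[str] = [
    "bucket-a/us_xx/raw/file1.csv",
    "bucket-a/us_xx/raw/file2.csv",
]


def is_dir(path: str) -> bool:
    # A "directory" exists exactly when at least one blob starts with its prefix.
    prefix = path.rstrip("/") + "/"
    return any(blob.startswith(prefix) for blob in BLOBS)


assert is_dir("bucket-a/us_xx/raw")
assert not is_dir("bucket-a/us_yy")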
Example #21
def gcsfs_direct_ingest_temporary_output_directory_path(
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    if project_id is None:
        project_id = metadata.project_id()
        if not project_id:
            raise ValueError("Project id not set")

    return GcsfsDirectoryPath.from_absolute_path(
        f"{project_id}-direct-ingest-temporary-files"
    )
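
The helper falls back to the ambient project id from metadata and fails fast when none is configured. A sketch of the same default-then-validate shape, reading the ambient value from an environment variable (GOOGLE_CLOUD_PROJECT is just a conventional stand-in for metadata.project_id(), not what this codebase reads):

import os
from typing import Optional


def temp_output_bucket(project_id: Optional[str] = None) -> str:
    if project_id is None:
        # Fall back to the ambient project id and fail fast if it is unset.
        project_id = os.environ.get("GOOGLE_CLOUD_PROJECT", "")
        if not project_id:
            raise ValueError("Project id not set")
    return f"{project_id}-direct-ingest-temporary-files"


assert temp_output_bucket("my-proj") == "my-proj-direct-ingest-temporary-files"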
Example #22
    def config_with_path(self, path: str) -> ExportBigQueryViewConfig:
        return ExportBigQueryViewConfig(
            view=SimpleBigQueryViewBuilder(
                dataset_id="test_dataset",
                view_id="test_view",
                view_query_template="you know",
            ).build(),
            view_filter_clause="WHERE state_code = 'US_XX'",
            intermediate_table_name="tubular",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                f"gs://{path}"),
        )
    def _copy_files_for_date(self, subdir_path_str: str) -> None:
        dir_path = GcsfsDirectoryPath.from_absolute_path(subdir_path_str.rstrip("/"))

        from_path = f"gs://{self.prod_region_storage_dir_path.bucket_name}/{dir_path.relative_path}*"
        to_path = f"gs://{self.staging_region_storage_dir_path.bucket_name}/{dir_path.relative_path}"

        if not self.dry_run:
            gsutil_cp(from_path=from_path, to_path=to_path)
        with self.mutex:
            self.copy_list.append((from_path, to_path))
            if self.copy_progress:
                self.copy_progress.next()
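
_copy_files_for_date performs the copy only outside dry-run mode but always records the planned pair under the mutex, so concurrent workers can append to the shared list safely. A runnable sketch of that structure with a thread pool (paths and bucket names are made up):

import threading
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple

DRY_RUN = True
copy_list: List[Tuple[str, str]] = []
mutex = threading.Lock()


def copy_one(subdir: str) -> None:
    from_path = f"gs://prod-bucket/{subdir}*"
    to_path = f"gs://staging-bucket/{subdir}"
    if not DRY_RUN:
        pass  # a real run would invoke the copy (e.g. gsutil cp) here
    with mutex:
        # The lock keeps appends from interleaving across worker threads.
        copy_list.append((from_path, to_path))


with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(copy_one, [f"us_xx/2020-01-0{i}/" for i in range(1, 5)])

assert len(copy_list) == 4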
Example #24
    def direct_ingest_storage_directory(self) -> GcsfsDirectoryPath:
        if in_gcp():
            return gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code,
                system_level=SystemLevel.STATE,
                ingest_instance=DirectIngestInstance.PRIMARY,
            )

        # Local override
        return GcsfsDirectoryPath.from_absolute_path(
            f"recidiviz-staging-direct-ingest-state-storage/{self.region_code.lower()}"
        )
    def test_json_throws(self) -> None:
        exporter = JsonLinesBigQueryViewExporter(self.mock_bq_client,
                                                 self.mock_validator)
        view_export_configs = [
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_bq_view_namespace,
                view=self.view_builder.build(),
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=
                f"{self.view_builder.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[
                    ExportOutputFormatType.JSON,
                    ExportOutputFormatType.HEADERLESS_CSV,
                ],
            ),
            ExportBigQueryViewConfig(
                bq_view_namespace=self.mock_bq_view_namespace,
                view=self.second_view_builder.build(),
                view_filter_clause=" WHERE state_code = 'US_XX'",
                intermediate_table_name=
                f"{self.second_view_builder.view_id}_table_US_XX",
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    "gs://{project_id}-dataset-location/subdirectory/{state_code}"
                    .format(
                        project_id=self.mock_project_id,
                        state_code="US_XX",
                    )),
                export_output_formats=[ExportOutputFormatType.METRIC],
            ),
        ]

        with self.assertRaises(ValueError):
            exporter.export(view_export_configs)
Example #26
    def setUp(self) -> None:
        self.metadata_patcher = patch('recidiviz.utils.metadata.project_id')
        self.mock_project_id_fn = self.metadata_patcher.start()
        self.mock_project_id_fn.return_value = 'project-id'

        metric_view_one = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view1',
            view_query_template='select * from table',
            dimensions=['a', 'b', 'c'],
        ).build()

        export_config_one_staging = ExportBigQueryViewConfig(
            view=metric_view_one,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket1/staging/US_XX'),
        )

        metric_view_two = MetricBigQueryViewBuilder(
            dataset_id='dataset',
            view_id='view2',
            view_query_template='select * from view2',
            dimensions=['d', 'e', 'f'],
        ).build()

        export_config_two_staging = ExportBigQueryViewConfig(
            view=metric_view_two,
            view_filter_clause='WHERE state_code = \'US_XX\'',
            intermediate_table_name='intermediate_table2',
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                'gs://bucket2/staging/US_XX'),
        )

        self.staging_paths = [
            export_config_one_staging.output_path('txt'),
            export_config_two_staging.output_path('txt')
        ]
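
The staging paths collected at the end presumably combine each config's output directory, the view id, and the requested extension; the real composition lives inside ExportBigQueryViewConfig.output_path, which is not shown here. Under that assumption, a purely hypothetical sketch:

def output_path(output_directory: str, view_id: str, extension: str) -> str:
    # Assumed composition: directory + view id + extension. The real
    # ExportBigQueryViewConfig.output_path may differ in its details.
    return f"{output_directory.rstrip('/')}/{view_id}.{extension}"


assert output_path("gs://bucket1/staging/US_XX", "view1", "txt") == (
    "gs://bucket1/staging/US_XX/view1.txt"
)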