def __init__(
    self,
    region_code: str,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    project_id: str,
):
    self.region_code = region_code
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.project_id = project_id
    self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id
        )
    )
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code,
            SystemLevel.STATE,
            GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id,
        )
    )
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_storage_files_from_unspecified_to_raw_start_bound_{self.region_code}_region_{self.start_date_bound}"
        f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None

def __init__(
    self,
    region_code: str,
    dry_run: bool,
):
    self.region_code = region_code
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.dry_run = dry_run
    self.project_id = 'recidiviz-123'
    self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id))
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code,
            SystemLevel.STATE,
            GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id))
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
        f'{datetime.datetime.now().isoformat()}.txt')
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None

def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[str] = None,
):
    self.project_id = project_id
    self.region = region.lower()
    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.lower_bound_update_datetime = lower_bound_update_datetime
    self.bucket = (
        GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        if gcs_destination_path is None
        else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
    )
    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )

def __init__(
    self,
    region_code: str,
    file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
):
    self.file_type = file_type
    self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-123"
        )
    )
    self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-staging"
        )
    )
    self.dry_run = dry_run
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
    self.mutex = threading.Lock()
    self.copy_list: List[Tuple[str, str]] = []
    self.copy_progress: Optional[Bar] = None

def test_get_configs_for_export_name(
    self, mock_environment: mock.MagicMock
) -> None:
    """Tests get_configs_for_export_name function to ensure that export names correctly match"""
    mock_environment.return_value = "production"

    export_configs_for_filter = view_export_manager.get_configs_for_export_name(
        export_name=self.mock_export_name,
        state_code=self.mock_state_code,
        project_id=self.mock_project_id,
    )

    view = self.mock_view_builder.build()
    metric_view = self.mock_metric_view_builder.build()

    expected_view_config_list = [
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_big_query_view_namespace,
            view=view,
            view_filter_clause=f" WHERE state_code = '{self.mock_state_code}'",
            intermediate_table_name=f"{view.view_id}_table_{self.mock_state_code}",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code=self.mock_state_code,
                )
            ),
            export_output_formats=[ExportOutputFormatType.JSON],
        ),
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_big_query_view_namespace,
            view=metric_view,
            view_filter_clause=f" WHERE state_code = '{self.mock_state_code}'",
            intermediate_table_name=f"{view.view_id}_table_{self.mock_state_code}",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code=self.mock_state_code,
                )
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        ),
    ]

    self.assertEqual(expected_view_config_list, export_configs_for_filter)

    # Test for case insensitivity
    export_configs_for_filter = view_export_manager.get_configs_for_export_name(
        export_name=self.mock_export_name.lower(),
        state_code=self.mock_state_code.lower(),
        project_id=self.mock_project_id,
    )

    self.assertEqual(expected_view_config_list, export_configs_for_filter)

def __init__(self,
             region_name: str,
             system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    super().__init__(region_name, system_level)
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.max_delay_sec_between_files = max_delay_sec_between_files

    if not ingest_directory_path:
        ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
            region_name, system_level)
    self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
        ingest_directory_path)

    if not storage_directory_path:
        storage_directory_path = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_name, system_level)
    self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
        storage_directory_path)

    self.temp_output_directory_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_temporary_output_directory_path())

    ingest_job_file_type_filter = (
        GcsfsDirectIngestFileType.INGEST_VIEW
        if self.region.is_raw_vs_ingest_file_name_detection_enabled()
        else None
    )
    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs,
        self.ingest_directory_path,
        self.get_file_tag_rank_list(),
        ingest_job_file_type_filter)

    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)

    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())

    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list()))

def setUp(self) -> None:
    self.mock_bq_client = mock.create_autospec(BigQueryClient)
    self.mock_validator = mock.create_autospec(BigQueryViewExportValidator)

    self.mock_project_id = "fake-project"

    self.metadata_patcher = mock.patch("recidiviz.utils.metadata.project_id")
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = self.mock_project_id

    self.view_builder = SimpleBigQueryViewBuilder(
        dataset_id="test_dataset",
        view_id="test_view",
        view_query_template="SELECT NULL LIMIT 0",
    )
    self.second_view_builder = SimpleBigQueryViewBuilder(
        dataset_id="test_dataset",
        view_id="test_view_2",
        view_query_template="SELECT NULL LIMIT 0",
    )
    self.view_export_configs = [
        ExportBigQueryViewConfig(
            view=self.view_builder.build(),
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{self.view_builder.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code="US_XX",
                )
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.HEADERLESS_CSV,
            ],
        ),
        ExportBigQueryViewConfig(
            view=self.second_view_builder.build(),
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{self.second_view_builder.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code="US_XX",
                )
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.CSV,
            ],
        ),
    ]

def test_export_dashboard_data_to_cloud_storage(
    self, mock_view_exporter, mock_view_update_manager_rematerialize
) -> None:
    """Tests the table is created from the view and then extracted."""
    view_export_manager.export_view_data_to_cloud_storage(
        self.mock_state_code, mock_view_exporter)

    view = self.mock_view_builder.build()
    metric_view = self.mock_metric_view_builder.build()

    view_export_configs = [
        ExportBigQueryViewConfig(
            view=view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code="US_XX",
                )
            ),
            export_output_formats=[ExportOutputFormatType.JSON],
        ),
        ExportBigQueryViewConfig(
            view=metric_view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code="US_XX",
                )
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        ),
    ]

    mock_view_update_manager_rematerialize.assert_called()
    mock_view_exporter.export_and_validate.assert_has_calls(
        [
            mock.call([]),  # CSV export
            mock.call(
                [view_export_configs[1].pointed_to_staging_subdirectory()]
            ),  # JSON export
            mock.call(
                [conf.pointed_to_staging_subdirectory() for conf in view_export_configs]
            ),  # METRIC export
        ],
        any_order=True,
    )

def setUp(self) -> None:
    self.project_id = "recidiviz-456"
    self.project_id_patcher = patch("recidiviz.utils.metadata.project_id")
    self.project_id_patcher.start().return_value = self.project_id
    self.test_region = fake_region(
        region_code="us_xx", are_raw_data_bq_imports_enabled_in_env=True
    )

    self.region_module_patcher = patch.object(
        direct_ingest_raw_table_migration_collector,
        "regions",
        new=controller_fixtures,
    )
    self.region_module_patcher.start()

    self.fs = DirectIngestGCSFileSystem(FakeGCSFileSystem())
    self.ingest_directory_path = GcsfsDirectoryPath(
        bucket_name="direct/controllers/fixtures"
    )
    self.temp_output_path = GcsfsDirectoryPath(bucket_name="temp_bucket")

    self.region_raw_file_config = DirectIngestRegionRawFileConfig(
        region_code="us_xx",
        yaml_config_file_dir=fixtures.as_filepath("us_xx"),
    )

    self.mock_big_query_client = create_autospec(BigQueryClient)
    self.num_lines_uploaded = 0
    self.mock_big_query_client.insert_into_table_from_cloud_storage_async.side_effect = (
        self.mock_import_raw_file_to_big_query
    )

    self.import_manager = DirectIngestRawFileImportManager(
        region=self.test_region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_path,
        region_raw_file_config=self.region_raw_file_config,
        big_query_client=self.mock_big_query_client,
    )
    self.import_manager.csv_reader = _TestSafeGcsCsvReader(self.fs.gcs_file_system)

    self.time_patcher = patch(
        "recidiviz.ingest.direct.controllers.direct_ingest_raw_file_import_manager.time"
    )
    self.mock_time = self.time_patcher.start()

    def fake_get_dataset_ref(dataset_id: str) -> bigquery.DatasetReference:
        return bigquery.DatasetReference(
            project=self.project_id, dataset_id=dataset_id
        )

    self.mock_big_query_client.dataset_ref_for_id = fake_get_dataset_ref

def __init__(
    self,
    project_id: str,
    region: str,
    file_type_to_move: GcsfsDirectIngestFileType,
    destination_file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    file_filter: Optional[str],
):
    self.project_id = project_id
    self.region = region
    self.file_type_to_move = file_type_to_move
    self.destination_file_type = destination_file_type

    if (
        self.file_type_to_move != self.destination_file_type
        and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
    ):
        raise ValueError(
            "Args file_type_to_move and destination_file_type must match unless the "
            "type to move is UNSPECIFIED"
        )

    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )

    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )

def test_metric_export_state_agnostic(self):
    """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig
    class when the export is state-agnostic."""
    state_agnostic_dataset_export_config = ExportMetricDatasetConfig(
        dataset_id='dataset_id',
        metric_view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket-without-state-codes",
        state_code_filter=None,
        export_name=None)

    view_configs_to_export = state_agnostic_dataset_export_config.export_configs_for_views_to_export(
        project_id=self.mock_project_id)

    expected_view = self.mock_view_builder.build()

    expected_view_export_configs = [
        ExportMetricBigQueryViewConfig(
            view=expected_view,
            view_filter_clause=None,
            intermediate_table_name=f"{expected_view.view_id}_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                state_agnostic_dataset_export_config.output_directory_uri_template.format(
                    project_id=self.mock_project_id,
                )
            ),
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)

def test_metric_export_state_specific(self):
    """Tests the export_configs_for_views_to_export function on the ExportMetricDatasetConfig
    class when the export is state-specific."""
    specific_state_dataset_export_config = ExportMetricDatasetConfig(
        dataset_id='dataset_id',
        metric_view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket",
        state_code_filter='US_XX',
        export_name=None)

    view_configs_to_export = specific_state_dataset_export_config.export_configs_for_views_to_export(
        project_id=self.mock_project_id)

    expected_view = self.mock_view_builder.build()

    expected_view_export_configs = [
        ExportMetricBigQueryViewConfig(
            view=expected_view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                f"gs://{self.mock_project_id}-bucket/US_XX"))
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)

def test_metric_export_lantern_dashboard(self) -> None:
    """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig
    class when the export is state-agnostic."""
    lantern_dashboard_dataset_export_config = ExportViewCollectionConfig(
        view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket-without-state-codes",
        export_name="TEST_EXPORT",
        bq_view_namespace=self.mock_big_query_view_namespace,
    )

    view_configs_to_export = (
        lantern_dashboard_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id,
        )
    )

    expected_view = self.mock_view_builder.build()

    expected_view_export_configs = [
        ExportBigQueryViewConfig(
            bq_view_namespace=self.mock_big_query_view_namespace,
            view=expected_view,
            view_filter_clause=None,
            intermediate_table_name=f"{expected_view.view_id}_table",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                lantern_dashboard_dataset_export_config.output_directory_uri_template.format(
                    project_id=self.mock_project_id,
                )
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)

def test_export_dashboard_data_to_cloud_storage_validation_error(
    self, mock_view_exporter, mock_view_update_manager
):
    """Tests the table is created from the view and then extracted."""
    mock_view_exporter.export_and_validate.side_effect = ViewExportValidationError

    # Should not throw
    metric_view_export_manager.export_view_data_to_cloud_storage(
        mock_state_code, mock_view_exporter)

    view = self.mock_view_builder.build()

    view_export_configs = [
        ExportMetricBigQueryViewConfig(
            view=view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                "gs://{project_id}-dataset-location/subdirectory/{state_code}".format(
                    project_id=self.mock_project_id,
                    state_code='US_XX',
                )
            ),
        )
    ]

    mock_view_update_manager.assert_called()
    mock_view_exporter.export_and_validate.assert_called_with(view_export_configs)

def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
    """Returns the appropriate paths to upload and the proper associated timestamp that
    it is to be normalized with. Skips any files that are not properly supported."""
    path_candidates = []
    for path, timestamp in self.paths_with_timestamps:
        if self.gcsfs.is_dir(path):
            directory = GcsfsDirectoryPath.from_absolute_path(path)
            files_in_directory = self.gcsfs.ls_with_blob_prefix(
                bucket_name=directory.bucket_name,
                blob_prefix=directory.relative_path,
            )
            for file in files_in_directory:
                if self._is_supported_extension(file.abs_path()):
                    path_candidates.append((file.abs_path(), timestamp))
                else:
                    self.skipped_files.append(file.abs_path())
        elif self.gcsfs.is_file(path):
            file = GcsfsFilePath.from_absolute_path(path)
            if self._is_supported_extension(file.abs_path()):
                path_candidates.append((file.abs_path(), timestamp))
            else:
                self.skipped_files.append(file.abs_path())
        else:
            logging.warning(
                "Could not indicate %s as a directory or a file in %s. Skipping",
                path,
                self.destination_ingest_bucket.uri(),
            )
            self.unable_to_upload_files.append(path)
            continue

    return path_candidates

def gcsfs_direct_ingest_storage_directory_path_for_region(
    *,
    region_code: str,
    system_level: SystemLevel,
    ingest_instance: DirectIngestInstance,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    if project_id is None:
        project_id = metadata.project_id()

        if not project_id:
            raise ValueError("Project id not set")

    suffix = bucket_suffix_for_ingest_instance(ingest_instance)
    bucket_name = build_ingest_storage_bucket_name(
        project_id=project_id,
        system_level_str=system_level.value.lower(),
        suffix=suffix,
    )
    storage_bucket = GcsfsBucketPath(bucket_name)

    if file_type is not None:
        subdir = os.path.join(region_code.lower(), file_type.value)
    else:
        subdir = region_code.lower()
    return GcsfsDirectoryPath.from_dir_and_subdir(storage_bucket, subdir)

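# Illustrative usage sketch (not part of the original source): the argument values below are
# assumptions, and the exact bucket name comes from build_ingest_storage_bucket_name, which is
# not shown here. The example only exercises the signature defined above.
example_storage_dir = gcsfs_direct_ingest_storage_directory_path_for_region(
    region_code="US_XX",
    system_level=SystemLevel.STATE,
    ingest_instance=DirectIngestInstance.PRIMARY,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    project_id="recidiviz-staging",
)
# example_storage_dir is a GcsfsDirectoryPath rooted at the ingest storage bucket, with the
# lower-cased region code as the subdirectory and the file type value appended when a
# file_type is passed; with file_type=None the subdirectory is just the region code.
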
def test_metric_export_lantern_dashboard_with_state(self):
    """Tests the export_configs_for_views_to_export function on the ExportViewCollectionConfig
    class when the export is state-specific."""
    lantern_dashboard_with_state_dataset_export_config = ExportViewCollectionConfig(
        view_builders_to_export=self.views_for_dataset,
        output_directory_uri_template="gs://{project_id}-bucket",
        state_code_filter="US_XX",
        export_name="TEST_EXPORT",
        bq_view_namespace=self.mock_big_query_view_namespace,
    )

    view_configs_to_export = (
        lantern_dashboard_with_state_dataset_export_config.export_configs_for_views_to_export(
            project_id=self.mock_project_id
        )
    )

    expected_view = self.mock_view_builder.build()

    expected_view_export_configs = [
        ExportBigQueryViewConfig(
            view=expected_view,
            view_filter_clause=" WHERE state_code = 'US_XX'",
            intermediate_table_name=f"{expected_view.view_id}_table_US_XX",
            output_directory=GcsfsDirectoryPath.from_absolute_path(
                f"gs://{self.mock_project_id}-bucket/US_XX"
            ),
            export_output_formats=[
                ExportOutputFormatType.JSON,
                ExportOutputFormatType.METRIC,
            ],
        )
    ]

    self.assertEqual(expected_view_export_configs, view_configs_to_export)

def export_configs_for_views_to_export(
    self, project_id: str
) -> Sequence[ExportBigQueryViewConfig]:
    """Builds a list of ExportBigQueryViewConfig that define how all views in
    view_builders_to_export should be exported to Google Cloud Storage."""
    view_filter_clause = (
        f" WHERE state_code = '{self.state_code_filter}'"
        if self.state_code_filter
        else None
    )

    intermediate_table_name = "{export_view_name}_table"
    output_directory = self.output_directory_uri_template.format(
        project_id=project_id)

    if self.state_code_filter:
        intermediate_table_name += f"_{self.state_code_filter}"
        output_directory += f"/{self.state_code_filter}"

    configs = []
    for vb in self.view_builders_to_export:
        view = vb.build()
        optional_args = {}
        if self.export_output_formats is not None:
            optional_args["export_output_formats"] = self.export_output_formats
        configs.append(
            ExportBigQueryViewConfig(
                view=view,
                view_filter_clause=view_filter_clause,
                intermediate_table_name=intermediate_table_name.format(
                    export_view_name=view.view_id),
                output_directory=GcsfsDirectoryPath.from_absolute_path(
                    output_directory),
                **optional_args,
            )
        )
    return configs

def create_export_manager(
    self,
    region: Region,
    is_detect_row_deletion_view: bool = False,
    materialize_raw_data_table_views: bool = False,
    controller_file_tags: Optional[List[str]] = None,
) -> DirectIngestIngestViewExportManager:
    metadata_manager = PostgresDirectIngestFileMetadataManager(region.region_code)
    controller_file_tags = (
        ["ingest_view"] if controller_file_tags is None else controller_file_tags
    )
    return DirectIngestIngestViewExportManager(
        region=region,
        fs=FakeGCSFileSystem(),
        ingest_directory_path=GcsfsDirectoryPath.from_absolute_path("ingest_bucket"),
        big_query_client=self.mock_client,
        file_metadata_manager=metadata_manager,
        view_collector=_ViewCollector(  # type: ignore[arg-type]
            region,
            controller_file_tags=controller_file_tags,
            is_detect_row_deletion_view=is_detect_row_deletion_view,
            materialize_raw_data_table_views=materialize_raw_data_table_views,
        ),
        launched_file_tags=controller_file_tags,
    )

def __init__(self,
             file_type: GcsfsDirectIngestFileType,
             region_code: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             project_id: str,
             file_filter: Optional[str]):
    self.file_type = file_type
    self.region_code = region_code
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.project_id = project_id
    self.region_storage_dir_path_for_file_type = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code,
            SystemLevel.STATE,
            self.file_type,
            project_id=self.project_id))
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_storage_files_to_deprecated_start_bound_{self.region_code}_region_{self.start_date_bound}'
        f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None

def export_configs_for_views_to_export(
    self, project_id: str
) -> Sequence[ExportMetricBigQueryViewConfig]:
    """Builds a list of ExportMetricBigQueryViewConfigs that define how all metric views in
    metric_view_builders_to_export should be exported to Google Cloud Storage."""
    view_filter_clause = (
        f" WHERE state_code = '{self.state_code_filter}'"
        if self.state_code_filter
        else None
    )

    intermediate_table_name = "{export_view_name}_table"
    output_directory = self.output_directory_uri_template.format(
        project_id=project_id
    )

    if self.state_code_filter:
        intermediate_table_name += f"_{self.state_code_filter}"
        output_directory += f"/{self.state_code_filter}"

    return [
        ExportMetricBigQueryViewConfig(
            view=view,
            view_filter_clause=view_filter_clause,
            intermediate_table_name=intermediate_table_name.format(
                export_view_name=view.view_id
            ),
            output_directory=GcsfsDirectoryPath.from_absolute_path(output_directory),
        )
        for view in [vb.build() for vb in self.metric_view_builders_to_export]
    ]

def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
):
    self.project_id = project_id
    self.region = region.lower()

    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.skipped_files: List[str] = []

    self.lower_bound_update_datetime = lower_bound_update_datetime
    self.bucket = (
        gcsfs_sftp_download_bucket_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
        if gcs_destination_path is None
        else gcs_destination_path
    )
    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )

    self.postgres_direct_ingest_file_metadata_manager = (
        PostgresDirectIngestRawFileMetadataManager(
            region,
            DirectIngestInstance.PRIMARY.database_version(
                SystemLevel.STATE, state_code=StateCode(self.region.upper())
            ).name,
        )
    )

def build_path(bucket_template: str, state: str, pdf_name: str) -> GcsfsFilePath:
    return GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(bucket_template.format(metadata.project_id()), state),
        pdf_name,
    )

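# Illustrative usage sketch (not part of the original source): the bucket template, state,
# and file name below are assumptions chosen only to show how the helper composes a path.
# With metadata.project_id() returning "my-project", this yields a GcsfsFilePath for
# gs://my-project-state-aggregates/us_ny/report.pdf.
example_pdf_path = build_path("{}-state-aggregates", "us_ny", "report.pdf")
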
def _move_files(self, from_uri: str):
    curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
    previous_date_format = filename_parts_from_path(curr_gcsfs_file_path).date_str
    new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")

    path_with_new_file_name = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA))
    if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_processed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA))

    raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, new_date_format)
    to_uri = GcsfsFilePath.from_directory_and_file_name(
        raw_dir_with_date, path_with_new_file_name.file_name).uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()

def get_paths_to_upload(self) -> List[Tuple[str, datetime.datetime]]:
    """Returns the appropriate paths to upload and the proper associated timestamp that
    it is to be normalized with. Skips any files that are not properly supported."""
    path_candidates = []
    for path, timestamp in self.paths_with_timestamps:
        if self.gcsfs.is_dir(path):
            directory = GcsfsDirectoryPath.from_absolute_path(path)
            files_in_directory = self.gcsfs.ls_with_blob_prefix(
                bucket_name=directory.bucket_name,
                blob_prefix=directory.relative_path,
            )
            for file in files_in_directory:
                path_candidates.append((file.abs_path(), timestamp))
        elif self.gcsfs.is_file(path):
            file = GcsfsFilePath.from_absolute_path(path)
            path_candidates.append((file.abs_path(), timestamp))
        else:
            logging.warning(
                "Could not indicate %s as a directory or a file in %s. Skipping",
                path,
                self.gcs_destination_path.uri(),
            )
            self.unable_to_upload_files.append(path)
            continue

    result = []
    for path, timestamp in path_candidates:
        _, ext = os.path.splitext(path)
        if not ext or ext not in self.SUPPORTED_EXTENSIONS:
            logging.info("Skipping file [%s] - invalid extension %s", path, ext)
            continue
        result.append((path, timestamp))

    return result

def setUp(self) -> None:
    self.metadata_patcher = patch("recidiviz.utils.metadata.project_id")
    self.mock_project_id_fn = self.metadata_patcher.start()
    self.mock_project_id_fn.return_value = "project-id"

    self.mock_bq_view_namespace = BigQueryViewNamespace.STATE

    metric_view_one = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view1",
        description="view1 description",
        view_query_template="select * from table",
        dimensions=("a", "b", "c"),
    ).build()

    export_config_one_staging = ExportBigQueryViewConfig(
        bq_view_namespace=self.mock_bq_view_namespace,
        view=metric_view_one,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket1/staging/US_XX"),
    )

    metric_view_two = MetricBigQueryViewBuilder(
        dataset_id="dataset",
        view_id="view2",
        description="view2 description",
        view_query_template="select * from view2",
        dimensions=("d", "e", "f"),
    ).build()

    export_config_two_staging = ExportBigQueryViewConfig(
        bq_view_namespace=self.mock_bq_view_namespace,
        view=metric_view_two,
        view_filter_clause="WHERE state_code = 'US_XX'",
        intermediate_table_name="intermediate_table2",
        output_directory=GcsfsDirectoryPath.from_absolute_path(
            "gs://bucket2/staging/US_XX"),
    )

    self.staging_paths = [
        export_config_one_staging.output_path("txt"),
        export_config_two_staging.output_path("txt"),
    ]

def is_dir(self, path: str) -> bool:
    try:
        directory = GcsfsDirectoryPath.from_absolute_path(path)
        has_dir = self.ls_with_blob_prefix(
            bucket_name=directory.bucket_name,
            blob_prefix=directory.relative_path)
        return len(has_dir) > 0
    except ValueError:
        return False

def state_aggregate() -> Tuple[str, HTTPStatus]:
    """Calls state aggregates"""
    bucket = get_str_param_value("bucket", request.args)
    state = get_str_param_value("state", request.args)
    filename = get_str_param_value("filename", request.args)
    project_id = metadata.project_id()
    logging.info("The project id is %s", project_id)
    if not bucket or not state or not filename:
        raise StateAggregateError("All of state, bucket, and filename must be provided")
    directory_path = GcsfsDirectoryPath(bucket, state)
    path = GcsfsFilePath.from_directory_and_file_name(directory_path, filename)
    parser = STATE_TO_PARSER[state]
    fs = GcsfsFactory.build()
    logging.info("The path to download from is %s", path)

    logging.info("The files in the directory are:")
    logging.info(
        fs.ls_with_blob_prefix(
            bucket_name=directory_path.bucket_name,
            blob_prefix=directory_path.relative_path,
        )
    )

    # Providing a stream buffer to the tabula reader does not work because it
    # tries to load the file into the local filesystem. Since appengine is a
    # read-only filesystem (except for the tmpdir), we download the file into
    # the local tmpdir and pass that in.
    handle = fs.download_to_temp_file(path)
    if not handle:
        raise StateAggregateError(f"Unable to download file: {path}")
    logging.info("Successfully downloaded file from gcs: %s", handle.local_file_path)

    result = parser(handle.local_file_path)
    logging.info("Successfully parsed the report")
    for table, df in result.items():
        dao.write_df(table, df)

    # If we are successful, we want to move the file out of the cloud
    # function triggered directory, and into the historical path.
    historical_path = GcsfsFilePath.from_directory_and_file_name(
        GcsfsDirectoryPath(HISTORICAL_BUCKET.format(project_id), state), filename
    )
    fs.mv(path, historical_path)
    return "", HTTPStatus.OK

def is_dir(self, path: str) -> bool:
    try:
        directory = GcsfsDirectoryPath.from_absolute_path(path)
        # If the directory is empty, has_dir will have 1 entry, which is the Blob
        # representing the directory.
        # Otherwise, if the directory doesn't exist on GCS, has_dir will return an
        # empty list.
        has_dir = self.ls_with_blob_prefix(
            bucket_name=directory.bucket_name,
            blob_prefix=directory.relative_path)
        return len(has_dir) > 0
    except ValueError:
        return False

def __init__(
    self,
    paths_with_timestamps: List[Tuple[str, datetime.datetime]],
    project_id: str,
    region: str,
    gcs_destination_path: Optional[str] = None,
):
    self.paths_with_timestamps = paths_with_timestamps
    self.project_id = project_id
    self.region = region.lower()

    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.gcs_destination_path = (
        GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id))
        if gcs_destination_path is None
        else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path))
    self.uploaded_files: List[str] = []
    self.unable_to_upload_files: List[str] = []