def __init__(
    self,
    region_code: str,
    file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
):
    """Stores configuration for copying storage files from prod to staging.

    Date bounds are inclusive ISO date strings, or None for unbounded.
    """
    self.file_type = file_type
    self.dry_run = dry_run
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound

    # Storage roots for this region in each GCP project.
    self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-123"
        )
    )
    self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-staging"
        )
    )

    # Timestamped result log, written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )

    # Shared state for the copy worker threads.
    self.mutex = threading.Lock()
    self.copy_list: List[Tuple[str, str]] = []
    self.copy_progress: Optional[Bar] = None
def __init__(
    self,
    region_code: str,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    project_id: str,
):
    """Stores configuration for moving UNSPECIFIED storage files into RAW_DATA storage.

    Args:
        region_code: Region to operate on, e.g. 'us_nd'.
        start_date_bound: Inclusive lower date bound (ISO date string), or None.
        end_date_bound: Inclusive upper date bound (ISO date string), or None.
        dry_run: If True, only log the moves that would be performed.
        project_id: GCP project whose storage buckets are targeted.
    """
    self.region_code = region_code
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.project_id = project_id
    self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id))
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code,
            SystemLevel.STATE,
            GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id,
        ))
    # Bug fix: the region code and start bound values were previously swapped
    # around the 'start_bound'/'region' labels, producing mislabeled log files
    # (..._start_bound_<region>_region_<start>...).
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_storage_files_from_unspecified_to_raw_{self.region_code}_region_start_bound_"
        f"{self.start_date_bound}_end_bound_{self.end_date_bound}_dry_run_{dry_run}_"
        f"{datetime.datetime.now().isoformat()}.txt",
    )
    # Shared state for the move worker threads.
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def main() -> None:
    """Executes the main flow of the script: copies SECONDARY ingest-view
    storage files into PRIMARY storage within a single project."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--region", required=True, help="E.g. 'us_nd'")
    parser.add_argument(
        "--project-id",
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        required=True,
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--dry-run",
        default=True,
        type=str_to_bool,
        help="Runs copy in dry-run mode, only prints the file copies it would do.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Source: SECONDARY instance storage; destination: PRIMARY instance storage.
    source_region_storage_dir_path = (
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=args.region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.SECONDARY,
            project_id=args.project_id,
        )
    )
    destination_region_storage_dir_path = (
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=args.region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=args.project_id,
        )
    )

    CopyStorageIngestFilesController(
        region_code=args.region,
        source_region_storage_dir_path=source_region_storage_dir_path,
        destination_region_storage_dir_path=destination_region_storage_dir_path,
        file_type_to_copy=GcsfsDirectIngestFileType.INGEST_VIEW,
        start_date_bound=None,
        end_date_bound=None,
        dry_run=args.dry_run,
    ).run()
def __init__(self, project_id: str, region: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str], dry_run: bool,
             file_filter: Optional[str]):
    """Stores configuration for moving files between this region's buckets.

    Date bounds are inclusive ISO date strings, or None for unbounded.
    """
    self.project_id = project_id
    self.region = region
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound

    # Region buckets in the selected project.
    self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region, SystemLevel.STATE, project_id=self.project_id)
    self.ingest_bucket = gcsfs_direct_ingest_directory_path_for_region(
        region, SystemLevel.STATE, project_id=self.project_id)

    # Shared state for the worker threads.
    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []

    # Timestamped result log, written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )
def test_get_state_storage_directory_path_file_type_raw(self) -> None:
    """RAW_DATA file type appends a 'raw' subdirectory to the state storage path."""
    result = gcsfs_direct_ingest_storage_directory_path_for_region(
        "us_nd", SystemLevel.STATE, GcsfsDirectIngestFileType.RAW_DATA
    )
    self.assertEqual(
        result, "recidiviz-staging-direct-ingest-state-storage/us_nd/raw"
    )
def __init__(self, file_type: GcsfsDirectIngestFileType,
             region_code: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool,
             project_id: str,
             file_filter: Optional[str]):
    """Stores configuration for moving storage files of one type to deprecated.

    Args:
        file_type: Type of storage files to move.
        region_code: Region to operate on, e.g. 'us_nd'.
        start_date_bound: Inclusive lower date bound (ISO date string), or None.
        end_date_bound: Inclusive upper date bound (ISO date string), or None.
        dry_run: If True, only log the moves that would be performed.
        project_id: GCP project whose storage bucket is targeted.
        file_filter: Optional filter restricting which files are moved.
    """
    self.file_type = file_type
    self.region_code = region_code
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.project_id = project_id
    self.region_storage_dir_path_for_file_type = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, self.file_type,
            project_id=self.project_id))
    # Bug fix: the region code and start bound values were previously swapped
    # around the 'start_bound'/'region' labels, producing mislabeled log files.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_storage_files_to_deprecated_{self.region_code}_region_start_bound_'
        f'{self.start_date_bound}_end_bound_{self.end_date_bound}_dry_run_{dry_run}_'
        f'{datetime.datetime.now().isoformat()}.txt'
    )
    # Shared state for the move worker threads.
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(
    self,
    region_code: str,
    dry_run: bool,
    project_id: str = 'recidiviz-123',
):
    """Stores configuration for moving ingest-bucket files into raw storage.

    Args:
        region_code: Region to operate on, e.g. 'us_nd'.
        dry_run: If True, only log the moves that would be performed.
        project_id: GCP project to target. Defaults to 'recidiviz-123' (prod),
            matching the previously hard-coded value, but can now be overridden
            (e.g. for staging).
    """
    self.region_code = region_code
    self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
    self.dry_run = dry_run
    self.project_id = project_id
    self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id=self.project_id))
    self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE,
            GcsfsDirectIngestFileType.RAW_DATA,
            project_id=self.project_id))
    # Bug fix: the filename previously contained a stray 'start_bound_' label
    # before the region code, though this class takes no date bounds.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'move_prod_ingest_files_to_raw_{self.region_code}_region_dry_run_{dry_run}_'
        f'{datetime.datetime.now().isoformat()}.txt')
    # Shared state for the move worker threads.
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(self, region_name: str, system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    """Sets up the filesystem, bucket paths, and job prioritizer for this region."""
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()
    self.max_delay_sec_between_files = max_delay_sec_between_files

    # Fall back to the conventional bucket paths when none are supplied.
    if not ingest_directory_path:
        ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
            region_name, system_level)
    self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
        ingest_directory_path)

    if not storage_directory_path:
        storage_directory_path = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_name, system_level)
    self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
        storage_directory_path)

    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs, self.ingest_directory_path, self._get_file_tag_rank_list())

    self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
def __init__(self, region_name: str, system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None):
    """Sets up the filesystem, directory paths, and job prioritizer.

    Explicit paths win; otherwise the conventional region paths are derived.
    Unlike sibling controllers, paths are kept as-is (not wrapped in
    GcsfsDirectoryPath).
    """
    super().__init__(region_name, system_level)
    self.fs = GcsfsFactory.build()

    # `or` preserves the original truthiness check: empty/None falls back.
    self.ingest_directory_path = (
        ingest_directory_path
        or gcsfs_direct_ingest_directory_path_for_region(region_name, system_level)
    )
    self.storage_directory_path = (
        storage_directory_path
        or gcsfs_direct_ingest_storage_directory_path_for_region(
            region_name, system_level)
    )

    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs, self.ingest_directory_path, self._get_file_tag_rank_list())
def __init__(self, region_name: str, system_level: SystemLevel,
             ingest_directory_path: Optional[str] = None,
             storage_directory_path: Optional[str] = None,
             max_delay_sec_between_files: Optional[int] = None):
    """Builds the full ingest controller: GCS paths, prioritizer, metadata
    manager, raw-file import manager, and ingest-view export manager.

    Args:
        region_name: Region to ingest for, passed through to the superclass.
        system_level: System level (STATE/COUNTY) for this controller.
        ingest_directory_path: Optional override for the ingest bucket path;
            defaults to the conventional path for the region.
        storage_directory_path: Optional override for the storage bucket path;
            defaults to the conventional path for the region.
        max_delay_sec_between_files: Optional throttle between file handling
            steps — semantics defined by callers; not used in this body.
    """
    super().__init__(region_name, system_level)
    # Wrap the raw GCS filesystem in the direct-ingest-aware wrapper.
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.max_delay_sec_between_files = max_delay_sec_between_files
    # Fall back to the conventional region paths when overrides are absent.
    if not ingest_directory_path:
        ingest_directory_path = \
            gcsfs_direct_ingest_directory_path_for_region(region_name, system_level)
    self.ingest_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)
    if not storage_directory_path:
        storage_directory_path = \
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level)
    self.storage_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(storage_directory_path)
    # Scratch space for intermediate ingest outputs.
    self.temp_output_directory_path = \
        GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())
    # Only prioritize INGEST_VIEW files when the region distinguishes
    # raw vs. ingest-view file names; otherwise no type filter.
    ingest_job_file_type_filter = \
        GcsfsDirectIngestFileType.INGEST_VIEW \
        if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
    self.file_prioritizer = \
        GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self.get_file_tag_rank_list(),
            ingest_job_file_type_filter)
    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT
    # Operations-DB-backed metadata tracking for this region's files.
    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code)
    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl())
    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        ingest_directory_path=self.ingest_directory_path,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region,
            self.get_file_tag_rank_list()))
def test_get_county_storage_directory_path_secondary(self) -> None:
    """SECONDARY instance uses a '-secondary'-suffixed county storage bucket."""
    path = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code="us_tx_brazos",
        system_level=SystemLevel.COUNTY,
        ingest_instance=DirectIngestInstance.SECONDARY,
    )
    self.assertEqual(
        path.abs_path(),
        "recidiviz-123-direct-ingest-county-storage-secondary/us_tx_brazos",
    )
def __init__(self, region_code: str,
             start_date_bound: Optional[str],
             end_date_bound: Optional[str],
             dry_run: bool):
    """Stores configuration for copying storage files from prod to staging.

    Date bounds are inclusive ISO date strings, or None for unbounded.
    """
    self.dry_run = dry_run
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound

    # Storage buckets for this region in each GCP project.
    self.prod_storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code, SystemLevel.STATE, project_id='recidiviz-123')
    self.staging_storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code, SystemLevel.STATE, project_id='recidiviz-staging')

    # Timestamped result log, written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f'copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_'
        f'{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
    )

    # Shared state for the copy worker threads.
    self.mutex = threading.Lock()
    self.copy_list: List[Tuple[str, str]] = []
    self.copy_progress: Optional[Bar] = None
def test_get_state_storage_directory_path_secondary(self) -> None:
    """SECONDARY instance uses a '-secondary'-suffixed state storage bucket."""
    path = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code="us_nd",
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.SECONDARY,
    )
    self.assertEqual(
        path.abs_path(),
        "recidiviz-staging-direct-ingest-state-storage-secondary/us_nd",
    )
def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
    """Initialize the controller for a specific ingest bucket.

    Derives the ingest instance from the bucket path, then wires up the
    lock manager, filesystem, storage/temp paths, file prioritizer, metadata
    manager, raw-file import manager, ingest-view export manager, and the
    instance status manager.
    """
    self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
    # The bucket name determines whether this is the PRIMARY or SECONDARY instance.
    self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
        ingest_bucket_path)
    self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
        region_code=self.region.region_code,
        schema_type=self.system_level.schema_type(),
        ingest_instance=self.ingest_instance,
    )
    self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
    self.ingest_bucket_path = ingest_bucket_path
    # Storage path for this region/instance (instance-specific bucket).
    self.storage_directory_path = (
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=self.region_code(),
            system_level=self.system_level,
            ingest_instance=self.ingest_instance,
        ))
    # Scratch space for intermediate ingest outputs.
    self.temp_output_directory_path = (
        gcsfs_direct_ingest_temporary_output_directory_path())
    self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
        self.fs,
        self.ingest_bucket_path,
        self.get_file_tag_rank_list(),
    )
    self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT
    # Operations-DB-backed metadata tracking, scoped to this instance's database.
    self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
        region_code=self.region.region_code,
        ingest_database_name=self.ingest_database_key.db_name,
    )
    self.raw_file_import_manager = DirectIngestRawFileImportManager(
        region=self.region,
        fs=self.fs,
        ingest_bucket_path=self.ingest_bucket_path,
        temp_output_directory_path=self.temp_output_directory_path,
        big_query_client=BigQueryClientImpl(),
    )
    # Exports ingest views back into this controller's own bucket.
    self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
        region=self.region,
        fs=self.fs,
        output_bucket_name=self.ingest_bucket_path.bucket_name,
        file_metadata_manager=self.file_metadata_manager,
        big_query_client=BigQueryClientImpl(),
        view_collector=DirectIngestPreProcessedIngestViewCollector(
            self.region, self.get_file_tag_rank_list()),
        launched_file_tags=self.get_file_tag_rank_list(),
    )
    self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
        self.region_code(), self.ingest_instance)
def test_get_county_storage_directory_path_raw(self) -> None:
    """RAW_DATA file type appends a 'raw' subdirectory under the county bucket."""
    path = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code="us_tx_brazos",
        system_level=SystemLevel.COUNTY,
        ingest_instance=DirectIngestInstance.PRIMARY,
        file_type=GcsfsDirectIngestFileType.RAW_DATA,
    )
    self.assertEqual(
        path.abs_path(),
        "recidiviz-123-direct-ingest-county-storage/us_tx_brazos/raw",
    )
def __init__(self, paths: str, project_id: str, region: str, date: str):
    """Stores upload parameters; the region attribute is normalized to lowercase."""
    self.paths = paths
    self.project_id = project_id
    self.region = region.lower()
    self.date = date
    # NOTE(review): the un-lowercased `region` argument (not self.region) is
    # what gets passed through here — preserved as-is.
    self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region, SystemLevel.STATE, project_id=self.project_id)
def direct_ingest_storage_directory(self) -> GcsfsDirectoryPath:
    """Returns the PRIMARY-instance storage directory for this region.

    When not running inside GCP, returns a hard-coded staging bucket path
    as a local override.
    """
    if not in_gcp():
        # Local override
        return GcsfsDirectoryPath.from_absolute_path(
            f"recidiviz-staging-direct-ingest-state-storage/{self.region_code.lower()}"
        )
    return gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=self.region_code,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.PRIMARY,
    )
def __init__(self, paths: str, project_id: str, region: str, date: str,
             dry_run: bool):
    """Stores upload parameters; `date` must be an ISO-format datetime string."""
    self.paths = paths
    self.project_id = project_id
    self.region = region.lower()
    # Parse eagerly so a malformed date fails here rather than mid-run.
    self.datetime = datetime.datetime.fromisoformat(date)
    self.dry_run = dry_run
    # NOTE(review): the un-lowercased `region` argument is passed through
    # here — preserved as-is.
    self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region, SystemLevel.STATE, project_id=self.project_id)
def __init__(
    self,
    project_id: str,
    region: str,
    file_type_to_move: GcsfsDirectIngestFileType,
    destination_file_type: GcsfsDirectIngestFileType,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    file_filter: Optional[str],
):
    """Stores configuration for moving files between buckets, optionally
    converting their file type.

    Args:
        project_id: GCP project to target.
        region: Region to operate on, e.g. 'us_nd'.
        file_type_to_move: Type of files to move.
        destination_file_type: Type the moved files should be labeled as. Must
            equal file_type_to_move unless file_type_to_move is UNSPECIFIED.
        start_date_bound: Inclusive lower date bound (ISO date string), or None.
        end_date_bound: Inclusive upper date bound (ISO date string), or None.
        dry_run: If True, only log the moves that would be performed.
        file_filter: Optional filter restricting which files are moved.

    Raises:
        ValueError: If the two file types differ and file_type_to_move is not
            UNSPECIFIED (type conversion is only supported from UNSPECIFIED).
    """
    self.project_id = project_id
    self.region = region
    self.file_type_to_move = file_type_to_move
    self.destination_file_type = destination_file_type

    if (
        self.file_type_to_move != self.destination_file_type
        and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
    ):
        # Bug fix: the original message was inverted — it claimed the types
        # must match "if type to move is UNSPECIFIED", but this branch fires
        # precisely when the type to move is NOT UNSPECIFIED.
        raise ValueError(
            "Args file_type_to_move and destination_file_type must match unless "
            "file_type_to_move is UNSPECIFIED"
        )

    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter

    # Region buckets in the selected project.
    self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )
    self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
        gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
    )

    # Shared state for the worker threads.
    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []

    # Timestamped result log, written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
def __init__(
    self,
    *,
    file_type: GcsfsDirectIngestFileType,
    region_code: str,
    ingest_instance: DirectIngestInstance,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    project_id: str,
    file_filter: Optional[str],
):
    """Stores configuration for moving storage files of one type to deprecated.

    Args:
        file_type: Type of storage files to move.
        region_code: Region to operate on, e.g. 'us_nd'.
        ingest_instance: Instance whose storage bucket is targeted. Must be
            PRIMARY when file_type is RAW_DATA.
        start_date_bound: Inclusive lower date bound (ISO date string), or None.
        end_date_bound: Inclusive upper date bound (ISO date string), or None.
        dry_run: If True, only log the moves that would be performed.
        project_id: GCP project whose storage bucket is targeted.
        file_filter: Optional filter restricting which files are moved.

    Raises:
        ValueError: If file_type is RAW_DATA and ingest_instance is not PRIMARY.
    """
    self.file_type = file_type
    self.region_code = region_code
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.project_id = project_id
    if (
        self.file_type == GcsfsDirectIngestFileType.RAW_DATA
        and ingest_instance != DirectIngestInstance.PRIMARY
    ):
        raise ValueError(
            f"Raw files are only ever handled in the PRIMARY ingest instance. "
            f"Instead, found ingest_instance [{ingest_instance}]."
        )
    self.region_storage_dir_path_for_file_type = (
        gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=region_code,
            system_level=SystemLevel.STATE,
            ingest_instance=ingest_instance,
            file_type=self.file_type,
            project_id=self.project_id,
        )
    )
    # Bug fix: the region code and start bound values were previously swapped
    # around the 'start_bound'/'region' labels, producing mislabeled log files.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_storage_files_to_deprecated_{self.region_code}_region_start_bound_"
        f"{self.start_date_bound}_end_bound_{self.end_date_bound}_dry_run_{dry_run}_"
        f"{datetime.datetime.now().isoformat()}.txt",
    )
    # Shared state for the move worker threads.
    self.mutex = threading.Lock()
    self.move_list: List[Tuple[str, str]] = []
    self.move_progress: Optional[Bar] = None
def __init__(
    self,
    project_id: str,
    region: str,
    start_date_bound: Optional[str],
    end_date_bound: Optional[str],
    dry_run: bool,
    file_filter: Optional[str],
):
    """Stores configuration for moving raw files for a region.

    Raw files live only in the PRIMARY instance's buckets, so both bucket
    lookups are pinned to PRIMARY.
    """
    self.project_id = project_id
    self.region = region
    self.state_code = StateCode(region.upper())
    self.dry_run = dry_run
    self.file_filter = file_filter
    self.start_date_bound = start_date_bound
    self.end_date_bound = end_date_bound

    # Raw files are only ever stored in the PRIMARY storage bucket
    self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=region,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=self.project_id,
    )
    # Raw files are only ever processed in the PRIMARY ingest bucket
    self.ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
        region_code=region,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=self.project_id,
    )

    # Shared state for the worker threads.
    self.mutex = threading.Lock()
    self.collect_progress: Optional[Bar] = None
    self.move_progress: Optional[Bar] = None
    self.moves_list: List[Tuple[str, str]] = []

    # Timestamped result log, written next to this script.
    self.log_output_path = os.path.join(
        os.path.dirname(__file__),
        f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
        f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
    )
def test_get_state_storage_directory_path(self) -> None:
    """Default state storage path is <project>-direct-ingest-state-storage/<region>."""
    result = gcsfs_direct_ingest_storage_directory_path_for_region(
        "us_nd", SystemLevel.STATE
    )
    self.assertEqual(
        result, "recidiviz-staging-direct-ingest-state-storage/us_nd"
    )
def test_get_county_storage_directory_path(self) -> None:
    """Default county storage path is <project>-direct-ingest-county-storage/<region>."""
    result = gcsfs_direct_ingest_storage_directory_path_for_region(
        "us_tx_brazos", SystemLevel.COUNTY
    )
    self.assertEqual(
        result, "recidiviz-123-direct-ingest-county-storage/us_tx_brazos"
    )
def get_ingest_instance_summaries(
        self, state_code: StateCode) -> List[Dict[str, Any]]:
    """Returns a list of dictionaries containing the following info for a given
    instance:
    i.e. {
        instance: the direct ingest instance,
        dbName: database name for this instance,
        storage: storage bucket absolute path,
        ingest: {
            name: bucket_name,
            unprocessedFilesRaw: how many unprocessed raw data files in the bucket,
            processedFilesRaw: how many processed raw data files are in the bucket (should be zero),
            unprocessedFilesIngestView: how many unprocessed ingest view files in the bucket,
            processedFilesIngestView: how many processed ingest view files are in the bucket (should be zero),
        },
        operations: {
            unprocessedFilesRaw: number of unprocessed raw files in the operations database
            unprocessedFilesIngestView: number of unprocessed ingest view files in the operations database
            dateOfEarliestUnprocessedIngestView: date of earliest unprocessed ingest file, if it exists
        }
    }

    One summary is produced per DirectIngestInstance (e.g. PRIMARY/SECONDARY).
    """
    # Bucket-path helpers expect a lowercase region code.
    formatted_state_code = state_code.value.lower()
    ingest_instance_summaries: List[Dict[str, Any]] = []
    for instance in DirectIngestInstance:
        # Get the ingest bucket path
        ingest_bucket_path = gcsfs_direct_ingest_bucket_for_region(
            region_code=formatted_state_code,
            system_level=SystemLevel.STATE,
            ingest_instance=instance,
            project_id=self.project_id,
        )
        # Get an object containing information about the ingest bucket
        ingest_bucket_metadata = self._get_bucket_metadata(
            ingest_bucket_path)
        # Get the storage bucket for this instance
        storage_bucket_path = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=formatted_state_code,
            system_level=SystemLevel.STATE,
            ingest_instance=instance,
            project_id=self.project_id,
        )
        # Get the database name corresponding to this instance
        ingest_db_name = self._get_database_name_for_state(
            state_code, instance)
        # Get the operations metadata for this ingest instance
        operations_db_metadata = self._get_operations_db_metadata(
            state_code, ingest_db_name)
        ingest_instance_summary: Dict[str, Any] = {
            "instance": instance.value,
            "storage": storage_bucket_path.abs_path(),
            "ingest": ingest_bucket_metadata,
            "dbName": ingest_db_name,
            "operations": operations_db_metadata,
        }
        ingest_instance_summaries.append(ingest_instance_summary)
    return ingest_instance_summaries
def main() -> None:
    """Executes the main flow of the script: copies a region's raw storage
    files from one GCP project's PRIMARY storage bucket to another's."""
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--region", required=True, help="E.g. 'us_nd'")
    parser.add_argument(
        "--source-project-id",
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        required=True,
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--destination-project-id",
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        required=True,
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--dry-run",
        default=True,
        type=str_to_bool,
        help="Runs copy in dry-run mode, only prints the file copies it would do.",
    )
    parser.add_argument(
        "--start-date-bound",
        help="The lower bound date to start from, inclusive. For partial copying of ingested files. "
        "E.g. 2019-09-23.",
    )
    parser.add_argument(
        "--end-date-bound",
        help="The upper bound date to end at, inclusive. For partial copying of ingested files. "
        "E.g. 2019-09-23.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Raw files are only ever stored in the PRIMARY storage bucket
    source_region_storage_dir_path = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=args.region,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=args.source_project_id,
    )
    # Raw files are only ever stored in the PRIMARY storage bucket
    destination_region_storage_dir_path = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=args.region,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=args.destination_project_id,
    )

    CopyStorageIngestFilesController(
        region_code=args.region,
        source_region_storage_dir_path=source_region_storage_dir_path,
        destination_region_storage_dir_path=destination_region_storage_dir_path,
        file_type_to_copy=GcsfsDirectIngestFileType.RAW_DATA,
        start_date_bound=args.start_date_bound,
        end_date_bound=args.end_date_bound,
        dry_run=args.dry_run,
    ).run()