def __init__(
        self,
        region_code: str,
        file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
    ):
        """Stores copy configuration and resolves the prod and staging storage
        directories for |region_code|.
        """
        self.file_type = file_type
        self.dry_run = dry_run
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound

        # Resolve the same region's state storage directory in each GCP project.
        prod_path = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-123"
        )
        staging_path = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id="recidiviz-staging"
        )
        self.prod_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(prod_path)
        self.staging_region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(staging_path)

        # Log file records every (source, destination) pair this run copies.
        log_file_name = (
            f"copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt"
        )
        self.log_output_path = os.path.join(os.path.dirname(__file__), log_file_name)

        self.mutex = threading.Lock()
        self.copy_list: List[Tuple[str, str]] = []
        self.copy_progress: Optional[Bar] = None
 def __init__(
     self,
     region_code: str,
     start_date_bound: Optional[str],
     end_date_bound: Optional[str],
     dry_run: bool,
     project_id: str,
 ):
     """Configures a move of UNSPECIFIED-typed storage files into the raw-data
     storage directory for |region_code|.

     Args:
         region_code: Region to operate on, e.g. 'us_nd'.
         start_date_bound: Inclusive lower date bound, or None for unbounded.
         end_date_bound: Inclusive upper date bound, or None for unbounded.
         dry_run: If True, only log the moves that would be performed.
         project_id: GCP project to operate against.
     """
     self.region_code = region_code
     self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
     self.start_date_bound = start_date_bound
     self.end_date_bound = end_date_bound
     self.dry_run = dry_run
     self.project_id = project_id
     # Source: the region's untyped (legacy) storage directory.
     self.region_storage_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code, SystemLevel.STATE, project_id=self.project_id))
     # Destination: the region's raw-data storage directory.
     self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             GcsfsDirectIngestFileType.RAW_DATA,
             project_id=self.project_id,
         ))
     # Fixed: the 'start_bound' label and the region code were swapped in this
     # filename (it used to read "...start_bound_<region>_region_<date>...").
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f"move_storage_files_from_unspecified_to_raw_{self.region_code}_region_start_bound_{self.start_date_bound}"
         f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
     )
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
Example #3
0
def main() -> None:
    """Parses CLI args and copies a region's SECONDARY ingest-view storage
    files into PRIMARY storage within one GCP project.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--region", required=True, help="E.g. 'us_nd'")
    parser.add_argument(
        "--project-id",
        required=True,
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--dry-run",
        type=str_to_bool,
        default=True,
        help="Runs copy in dry-run mode, only prints the file copies it would do.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    # Copy SECONDARY-instance storage into PRIMARY-instance storage.
    source_dir = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=args.region,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.SECONDARY,
        project_id=args.project_id,
    )
    destination_dir = gcsfs_direct_ingest_storage_directory_path_for_region(
        region_code=args.region,
        system_level=SystemLevel.STATE,
        ingest_instance=DirectIngestInstance.PRIMARY,
        project_id=args.project_id,
    )
    controller = CopyStorageIngestFilesController(
        region_code=args.region,
        source_region_storage_dir_path=source_dir,
        destination_region_storage_dir_path=destination_dir,
        file_type_to_copy=GcsfsDirectIngestFileType.INGEST_VIEW,
        start_date_bound=None,
        end_date_bound=None,
        dry_run=args.dry_run,
    )
    controller.run()
    def __init__(self, project_id: str, region: str,
                 start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool,
                 file_filter: Optional[str]):
        """Stores move configuration and resolves the storage and ingest
        buckets for |region| in the configured project.
        """
        self.project_id = project_id
        self.region = region
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        # The two buckets this move operates between.
        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)
        self.ingest_bucket = gcsfs_direct_ingest_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)

        # Thread-safety bookkeeping for the parallel move.
        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []

        log_file = (
            f'move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.log_output_path = os.path.join(os.path.dirname(__file__), log_file)
Example #5
0
 def test_get_state_storage_directory_path_file_type_raw(self) -> None:
     """A RAW_DATA file type appends a '/raw' suffix to the state storage path."""
     result = gcsfs_direct_ingest_storage_directory_path_for_region(
         "us_nd", SystemLevel.STATE, GcsfsDirectIngestFileType.RAW_DATA)
     self.assertEqual(
         result, "recidiviz-staging-direct-ingest-state-storage/us_nd/raw")
 def __init__(self, file_type: GcsfsDirectIngestFileType, region_code: str,
              start_date_bound: Optional[str],
              end_date_bound: Optional[str], dry_run: bool, project_id: str,
              file_filter: Optional[str]):
     """Configures a move of |file_type| storage files to a deprecated location.

     Args:
         file_type: Type of storage files to move.
         region_code: Region to operate on, e.g. 'us_nd'.
         start_date_bound: Inclusive lower date bound, or None for unbounded.
         end_date_bound: Inclusive upper date bound, or None for unbounded.
         dry_run: If True, only log the moves that would be performed.
         project_id: GCP project to operate against.
         file_filter: Optional filter restricting which files are moved.
     """
     self.file_type = file_type
     self.region_code = region_code
     self.start_date_bound = start_date_bound
     self.end_date_bound = end_date_bound
     self.dry_run = dry_run
     self.file_filter = file_filter
     self.project_id = project_id
     # Storage directory holding files of this type for the region.
     self.region_storage_dir_path_for_file_type = GcsfsDirectoryPath.from_absolute_path(
         gcsfs_direct_ingest_storage_directory_path_for_region(
             region_code,
             SystemLevel.STATE,
             self.file_type,
             project_id=self.project_id))
     # Fixed: the 'start_bound' label and the region code were swapped in this
     # filename (it used to read "...start_bound_<region>_region_<date>...").
     self.log_output_path = os.path.join(
         os.path.dirname(__file__),
         f'move_storage_files_to_deprecated_{self.region_code}_region_start_bound_{self.start_date_bound}'
         f'_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
     )
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
 def __init__(
     self,
     region_code: str,
     dry_run: bool,
 ):
     """Sets up a move of production ingest-bucket files into raw-data storage
     for |region_code|.
     """
     self.region_code = region_code
     self.file_type = GcsfsDirectIngestFileType.UNSPECIFIED
     self.dry_run = dry_run
     # This script only ever operates against production.
     self.project_id = 'recidiviz-123'

     # Source: the region's production ingest bucket.
     ingest_bucket = gcsfs_direct_ingest_directory_path_for_region(
         region_code, SystemLevel.STATE, project_id=self.project_id)
     self.region_ingest_bucket_dir_path = GcsfsDirectoryPath.from_absolute_path(ingest_bucket)

     # Destination: the region's raw-data storage directory.
     raw_storage = gcsfs_direct_ingest_storage_directory_path_for_region(
         region_code,
         SystemLevel.STATE,
         GcsfsDirectIngestFileType.RAW_DATA,
         project_id=self.project_id)
     self.region_storage_raw_dir_path = GcsfsDirectoryPath.from_absolute_path(raw_storage)

     log_name = (
         f'move_prod_ingest_files_to_raw_start_bound_{self.region_code}_region_dry_run_{dry_run}_'
         f'{datetime.datetime.now().isoformat()}.txt'
     )
     self.log_output_path = os.path.join(os.path.dirname(__file__), log_name)
     self.mutex = threading.Lock()
     self.move_list: List[Tuple[str, str]] = []
     self.move_progress: Optional[Bar] = None
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        """Builds the controller, defaulting the ingest and storage paths to
        the region's conventional locations when not overridden.
        """
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()
        self.max_delay_sec_between_files = max_delay_sec_between_files

        # A falsy override (None/empty) falls back to the conventional path.
        if not ingest_directory_path:
            ingest_directory_path = gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level)
        self.ingest_directory_path = GcsfsDirectoryPath.from_absolute_path(
            ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = (
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level))
        self.storage_directory_path = GcsfsDirectoryPath.from_absolute_path(
            storage_directory_path)

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self._get_file_tag_rank_list())

        self.file_split_line_limit = self._FILE_SPLIT_LINE_LIMIT
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None):
        """Builds the controller; falsy path overrides fall back to the
        region's conventional ingest and storage locations.
        """
        super().__init__(region_name, system_level)
        self.fs = GcsfsFactory.build()

        # `or` mirrors the original truthiness check: None or empty string
        # both fall back to the conventional path.
        self.ingest_directory_path = (
            ingest_directory_path
            or gcsfs_direct_ingest_directory_path_for_region(
                region_name, system_level))
        self.storage_directory_path = (
            storage_directory_path
            or gcsfs_direct_ingest_storage_directory_path_for_region(
                region_name, system_level))

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_directory_path,
            self._get_file_tag_rank_list())
Example #10
0
    def __init__(self,
                 region_name: str,
                 system_level: SystemLevel,
                 ingest_directory_path: Optional[str] = None,
                 storage_directory_path: Optional[str] = None,
                 max_delay_sec_between_files: Optional[int] = None):
        """Builds the ingest controller and all of its collaborators.

        Args:
            region_name: Region this controller ingests, e.g. 'us_nd'.
            system_level: The system level (e.g. STATE) for the region.
            ingest_directory_path: Optional override for the ingest bucket
                path; defaults to the region's conventional path.
            storage_directory_path: Optional override for the storage path;
                defaults to the region's conventional path.
            max_delay_sec_between_files: Optional pacing delay between files.
        """
        super().__init__(region_name, system_level)
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.max_delay_sec_between_files = max_delay_sec_between_files

        # A falsy override (None/empty) falls back to the conventional path.
        if not ingest_directory_path:
            ingest_directory_path = \
                gcsfs_direct_ingest_directory_path_for_region(region_name,
                                                              system_level)
        self.ingest_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(ingest_directory_path)

        if not storage_directory_path:
            storage_directory_path = \
                gcsfs_direct_ingest_storage_directory_path_for_region(
                    region_name, system_level)

        self.storage_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(storage_directory_path)

        self.temp_output_directory_path = \
            GcsfsDirectoryPath.from_absolute_path(gcsfs_direct_ingest_temporary_output_directory_path())

        # Only filter ingest jobs by file type when the region distinguishes
        # raw vs ingest-view file names.
        ingest_job_file_type_filter = \
            GcsfsDirectIngestFileType.INGEST_VIEW \
            if self.region.is_raw_vs_ingest_file_name_detection_enabled() else None
        self.file_prioritizer = \
            GcsfsDirectIngestJobPrioritizer(
                self.fs,
                self.ingest_directory_path,
                self.get_file_tag_rank_list(),
                ingest_job_file_type_filter)

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        # Tracks file metadata in the operations database for this region.
        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code)

        # NOTE(review): two separate BigQueryClientImpl instances are created
        # here and below — confirm whether they could share one client.
        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl())

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            ingest_directory_path=self.ingest_directory_path,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()))
Example #11
0
 def test_get_county_storage_directory_path_secondary(self) -> None:
     """SECONDARY county storage buckets carry a '-secondary' suffix."""
     path = gcsfs_direct_ingest_storage_directory_path_for_region(
         region_code="us_tx_brazos",
         system_level=SystemLevel.COUNTY,
         ingest_instance=DirectIngestInstance.SECONDARY,
     )
     expected = "recidiviz-123-direct-ingest-county-storage-secondary/us_tx_brazos"
     self.assertEqual(path.abs_path(), expected)
    def __init__(self, region_code: str, start_date_bound: Optional[str],
                 end_date_bound: Optional[str], dry_run: bool):
        """Stores copy configuration and the prod/staging storage bucket paths
        for |region_code|.
        """
        self.dry_run = dry_run
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound

        # The same region's storage bucket in each GCP project.
        self.prod_storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id='recidiviz-123')
        self.staging_storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code, SystemLevel.STATE, project_id='recidiviz-staging')

        log_name = (
            f'copy_prod_to_staging_result_{region_code}_start_bound_{self.start_date_bound}_end_bound_'
            f'{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt'
        )
        self.log_output_path = os.path.join(os.path.dirname(__file__), log_name)
        self.mutex = threading.Lock()
        self.copy_list: List[Tuple[str, str]] = []
        self.copy_progress: Optional[Bar] = None
Example #13
0
 def test_get_state_storage_directory_path_secondary(self) -> None:
     """SECONDARY state storage buckets carry a '-secondary' suffix."""
     path = gcsfs_direct_ingest_storage_directory_path_for_region(
         region_code="us_nd",
         system_level=SystemLevel.STATE,
         ingest_instance=DirectIngestInstance.SECONDARY,
     )
     expected = "recidiviz-staging-direct-ingest-state-storage-secondary/us_nd"
     self.assertEqual(path.abs_path(), expected)
Example #14
0
    def __init__(self, ingest_bucket_path: GcsfsBucketPath) -> None:
        """Initializes the controller and its managers for |ingest_bucket_path|.

        The ingest instance (PRIMARY/SECONDARY) is derived from the bucket path
        itself; the storage path, lock manager, and import/export managers are
        all resolved from it.
        """
        self.cloud_task_manager = DirectIngestCloudTaskManagerImpl()
        # Which ingest instance this bucket belongs to.
        self.ingest_instance = DirectIngestInstance.for_ingest_bucket(
            ingest_bucket_path)
        self.region_lock_manager = DirectIngestRegionLockManager.for_direct_ingest(
            region_code=self.region.region_code,
            schema_type=self.system_level.schema_type(),
            ingest_instance=self.ingest_instance,
        )
        self.fs = DirectIngestGCSFileSystem(GcsfsFactory.build())
        self.ingest_bucket_path = ingest_bucket_path
        # Storage directory for this region/instance combination.
        self.storage_directory_path = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=self.region_code(),
                system_level=self.system_level,
                ingest_instance=self.ingest_instance,
            ))

        self.temp_output_directory_path = (
            gcsfs_direct_ingest_temporary_output_directory_path())

        self.file_prioritizer = GcsfsDirectIngestJobPrioritizer(
            self.fs,
            self.ingest_bucket_path,
            self.get_file_tag_rank_list(),
        )

        self.ingest_file_split_line_limit = self._INGEST_FILE_SPLIT_LINE_LIMIT

        # File metadata rows live in this instance's operations database.
        self.file_metadata_manager = PostgresDirectIngestFileMetadataManager(
            region_code=self.region.region_code,
            ingest_database_name=self.ingest_database_key.db_name,
        )

        # NOTE(review): two separate BigQueryClientImpl instances are created
        # below — confirm whether they could share one client.
        self.raw_file_import_manager = DirectIngestRawFileImportManager(
            region=self.region,
            fs=self.fs,
            ingest_bucket_path=self.ingest_bucket_path,
            temp_output_directory_path=self.temp_output_directory_path,
            big_query_client=BigQueryClientImpl(),
        )

        self.ingest_view_export_manager = DirectIngestIngestViewExportManager(
            region=self.region,
            fs=self.fs,
            output_bucket_name=self.ingest_bucket_path.bucket_name,
            file_metadata_manager=self.file_metadata_manager,
            big_query_client=BigQueryClientImpl(),
            view_collector=DirectIngestPreProcessedIngestViewCollector(
                self.region, self.get_file_tag_rank_list()),
            launched_file_tags=self.get_file_tag_rank_list(),
        )

        self.ingest_instance_status_manager = DirectIngestInstanceStatusManager(
            self.region_code(), self.ingest_instance)
Example #15
0
 def test_get_county_storage_directory_path_raw(self) -> None:
     """A RAW_DATA file type appends a '/raw' suffix to the county storage path."""
     path = gcsfs_direct_ingest_storage_directory_path_for_region(
         region_code="us_tx_brazos",
         system_level=SystemLevel.COUNTY,
         ingest_instance=DirectIngestInstance.PRIMARY,
         file_type=GcsfsDirectIngestFileType.RAW_DATA,
     )
     expected = "recidiviz-123-direct-ingest-county-storage/us_tx_brazos/raw"
     self.assertEqual(path.abs_path(), expected)
Example #16
0
    def __init__(self, paths: str, project_id: str, region: str, date: str):
        """Stores upload parameters and resolves the region's storage bucket."""
        self.paths = paths
        self.project_id = project_id
        self.region = region.lower()
        self.date = date

        # NOTE(review): the bucket lookup uses the original-cased |region|
        # argument rather than self.region — confirm this is intentional.
        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)
Example #17
0
    def direct_ingest_storage_directory(self) -> GcsfsDirectoryPath:
        """Returns the PRIMARY-instance state storage directory, or a fixed
        staging path when running outside GCP.
        """
        if not in_gcp():
            # Local override
            return GcsfsDirectoryPath.from_absolute_path(
                f"recidiviz-staging-direct-ingest-state-storage/{self.region_code.lower()}"
            )
        return gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=self.region_code,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
        )
    def __init__(self, paths: str, project_id: str, region: str, date: str,
                 dry_run: bool):
        """Stores upload parameters and resolves the region's storage bucket."""
        self.paths = paths
        self.project_id = project_id
        self.region = region.lower()
        # Parse once up front so a malformed date fails fast.
        self.datetime = datetime.datetime.fromisoformat(date)
        self.dry_run = dry_run

        # NOTE(review): the bucket lookup uses the original-cased |region|
        # argument rather than self.region — confirm this is intentional.
        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id)
Example #19
0
    def __init__(
        self,
        project_id: str,
        region: str,
        file_type_to_move: GcsfsDirectIngestFileType,
        destination_file_type: GcsfsDirectIngestFileType,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):
        """Configures a storage-to-ingest file move, optionally retyping files.

        Raises:
            ValueError: If the two file types differ while file_type_to_move is
                not UNSPECIFIED (retyping is only allowed from UNSPECIFIED).
        """
        self.project_id = project_id
        self.region = region
        self.file_type_to_move = file_type_to_move
        self.destination_file_type = destination_file_type

        # Retyping is only permitted when moving UNSPECIFIED files; otherwise
        # source and destination types must agree.
        if (
            self.file_type_to_move != self.destination_file_type
            and self.file_type_to_move != GcsfsDirectIngestFileType.UNSPECIFIED
        ):
            # Fixed error message: the old text said the types must match *if*
            # the type to move is UNSPECIFIED, which inverted the condition.
            raise ValueError(
                "Args file_type_to_move and destination_file_type must match "
                "unless file_type_to_move is UNSPECIFIED"
            )

        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        self.storage_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        self.ingest_bucket = GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )

        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
Example #20
0
    def __init__(
        self,
        *,
        file_type: GcsfsDirectIngestFileType,
        region_code: str,
        ingest_instance: DirectIngestInstance,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        project_id: str,
        file_filter: Optional[str],
    ):
        """Configures a move of |file_type| storage files to a deprecated location.

        Raises:
            ValueError: If |file_type| is RAW_DATA and |ingest_instance| is not
                PRIMARY — raw files only ever live in the PRIMARY instance.
        """
        self.file_type = file_type
        self.region_code = region_code
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter
        self.project_id = project_id

        if (
            self.file_type == GcsfsDirectIngestFileType.RAW_DATA
            and ingest_instance != DirectIngestInstance.PRIMARY
        ):
            raise ValueError(
                f"Raw files are only ever handled in the PRIMARY ingest instance. "
                f"Instead, found ingest_instance [{ingest_instance}]."
            )

        self.region_storage_dir_path_for_file_type = (
            gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=region_code,
                system_level=SystemLevel.STATE,
                ingest_instance=ingest_instance,
                file_type=self.file_type,
                project_id=self.project_id,
            )
        )
        # Fixed: the 'start_bound' label and the region code were swapped in
        # this filename (it used to read "...start_bound_<region>_region_<date>...").
        self.log_output_path = os.path.join(
            os.path.dirname(__file__),
            f"move_storage_files_to_deprecated_{self.region_code}_region_start_bound_{self.start_date_bound}"
            f"_end_bound_{self.end_date_bound}_dry_run_{dry_run}_{datetime.datetime.now().isoformat()}.txt",
        )
        self.mutex = threading.Lock()
        self.move_list: List[Tuple[str, str]] = []
        self.move_progress: Optional[Bar] = None
Example #21
0
    def __init__(
        self,
        project_id: str,
        region: str,
        start_date_bound: Optional[str],
        end_date_bound: Optional[str],
        dry_run: bool,
        file_filter: Optional[str],
    ):
        """Stores raw-file move configuration and resolves the PRIMARY storage
        and ingest buckets for |region|.
        """
        self.project_id = project_id
        self.region = region
        self.state_code = StateCode(region.upper())
        self.start_date_bound = start_date_bound
        self.end_date_bound = end_date_bound
        self.dry_run = dry_run
        self.file_filter = file_filter

        # Raw files are only ever stored in the PRIMARY storage bucket
        self.storage_bucket = gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=self.project_id,
        )
        # Raw files are only ever processed in the PRIMARY ingest bucket
        self.ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
            region_code=region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=self.project_id,
        )

        # Thread-safety bookkeeping for the parallel move.
        self.mutex = threading.Lock()
        self.collect_progress: Optional[Bar] = None
        self.move_progress: Optional[Bar] = None
        self.moves_list: List[Tuple[str, str]] = []

        log_name = (
            f"move_result_{region}_{self.project_id}_start_bound_{self.start_date_bound}_end_bound_"
            f"{self.end_date_bound}_dry_run_{self.dry_run}_{datetime.datetime.now().isoformat()}.txt"
        )
        self.log_output_path = os.path.join(os.path.dirname(__file__), log_name)
Example #22
0
 def test_get_state_storage_directory_path(self) -> None:
     """Default state storage path for us_nd in the staging project."""
     expected = "recidiviz-staging-direct-ingest-state-storage/us_nd"
     result = gcsfs_direct_ingest_storage_directory_path_for_region(
         "us_nd", SystemLevel.STATE)
     self.assertEqual(result, expected)
Example #23
0
 def test_get_county_storage_directory_path(self) -> None:
     """Default county storage path for us_tx_brazos in the prod project."""
     expected = "recidiviz-123-direct-ingest-county-storage/us_tx_brazos"
     result = gcsfs_direct_ingest_storage_directory_path_for_region(
         "us_tx_brazos", SystemLevel.COUNTY)
     self.assertEqual(result, expected)
    def get_ingest_instance_summaries(
            self, state_code: StateCode) -> List[Dict[str, Any]]:
        """Summarizes each direct-ingest instance for |state_code|.

        Each summary dict contains:
            instance: the direct ingest instance value
            dbName: database name for this instance
            storage: storage bucket absolute path
            ingest: ingest bucket metadata (name plus counts of processed and
                unprocessed raw / ingest-view files; processed counts should
                be zero)
            operations: operations-database metadata (unprocessed raw and
                ingest-view file counts, and the date of the earliest
                unprocessed ingest view, if one exists)
        """
        region_code = state_code.value.lower()

        summaries: List[Dict[str, Any]] = []
        for instance in DirectIngestInstance:
            # Ingest bucket and its contents for this instance.
            ingest_bucket = gcsfs_direct_ingest_bucket_for_region(
                region_code=region_code,
                system_level=SystemLevel.STATE,
                ingest_instance=instance,
                project_id=self.project_id,
            )
            ingest_metadata = self._get_bucket_metadata(ingest_bucket)

            # Storage bucket for this instance.
            storage_path = gcsfs_direct_ingest_storage_directory_path_for_region(
                region_code=region_code,
                system_level=SystemLevel.STATE,
                ingest_instance=instance,
                project_id=self.project_id,
            )

            # Operations database info for this instance.
            db_name = self._get_database_name_for_state(state_code, instance)
            operations_metadata = self._get_operations_db_metadata(
                state_code, db_name)

            summaries.append({
                "instance": instance.value,
                "storage": storage_path.abs_path(),
                "ingest": ingest_metadata,
                "dbName": db_name,
                "operations": operations_metadata,
            })

        return summaries
Example #25
0
def main() -> None:
    """Parses CLI args and copies a region's PRIMARY raw-file storage from one
    GCP project to another.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("--region", required=True, help="E.g. 'us_nd'")
    parser.add_argument(
        "--source-project-id",
        required=True,
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--destination-project-id",
        required=True,
        choices=[GCP_PROJECT_STAGING, GCP_PROJECT_PRODUCTION],
        help="Used to select which GCP project against which to run this script.",
    )
    parser.add_argument(
        "--dry-run",
        type=str_to_bool,
        default=True,
        help="Runs copy in dry-run mode, only prints the file copies it would do.",
    )
    parser.add_argument(
        "--start-date-bound",
        help="The lower bound date to start from, inclusive. For partial copying of ingested files. "
        "E.g. 2019-09-23.",
    )
    parser.add_argument(
        "--end-date-bound",
        help="The upper bound date to end at, inclusive. For partial copying of ingested files. "
        "E.g. 2019-09-23.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    def storage_dir_for_project(project_id: str):
        # Raw files are only ever stored in the PRIMARY storage bucket
        return gcsfs_direct_ingest_storage_directory_path_for_region(
            region_code=args.region,
            system_level=SystemLevel.STATE,
            ingest_instance=DirectIngestInstance.PRIMARY,
            project_id=project_id,
        )

    CopyStorageIngestFilesController(
        region_code=args.region,
        source_region_storage_dir_path=storage_dir_for_project(args.source_project_id),
        destination_region_storage_dir_path=storage_dir_for_project(
            args.destination_project_id),
        file_type_to_copy=GcsfsDirectIngestFileType.RAW_DATA,
        start_date_bound=args.start_date_bound,
        end_date_bound=args.end_date_bound,
        dry_run=args.dry_run,
    ).run()