def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        # SFTP credentials and region-specific download behavior.
        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        # Accounting used to report results at the end of a download run.
        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
        self.skipped_files: List[str] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            gcsfs_sftp_download_bucket_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
            if gcs_destination_path is None
            else gcs_destination_path
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )

        self.postgres_direct_ingest_file_metadata_manager = (
            PostgresDirectIngestRawFileMetadataManager(
                region,
                DirectIngestInstance.PRIMARY.database_version(
                    SystemLevel.STATE, state_code=StateCode(self.region.upper())
                ).name,
            )
        )

    def _move_files(self, from_uri: str) -> None:
        curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)

        # Recover the date embedded in the normalized file name and convert it
        # into a date-partitioned subdirectory of the form YYYY/MM/DD/.
        previous_date_format = filename_parts_from_path(
            curr_gcsfs_file_path
        ).date_str
        new_date_format = date.fromisoformat(previous_date_format).strftime(
            "%Y/%m/%d/"
        )

        # Re-normalize the file name as raw data, preserving the file's
        # processed/unprocessed status.
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_unprocessed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA
            )
        )
        if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
            path_with_new_file_name = GcsfsFilePath.from_absolute_path(
                to_normalized_processed_file_path_from_normalized_path(
                    from_uri, GcsfsDirectIngestFileType.RAW_DATA
                )
            )

        raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
            self.region_storage_raw_dir_path, new_date_format
        )
        to_uri = GcsfsFilePath.from_directory_and_file_name(
            raw_dir_with_date, path_with_new_file_name.file_name
        ).uri()

        # Perform the move (unless this is a dry run) and record it under the
        # mutex so concurrent workers can share the move list and progress bar.
        if not self.dry_run:
            gsutil_mv(from_path=from_uri, to_path=to_uri)
        with self.mutex:
            self.move_list.append((from_uri, to_uri))
            if self.move_progress:
                self.move_progress.next()
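For orientation, the date re-partitioning step above turns the date_str recovered from a normalized file name into a YYYY/MM/DD/ storage subdirectory. A minimal, self-contained sketch of just that conversion, using a hypothetical date value:

from datetime import date

# Hypothetical date_str, as filename_parts_from_path(...) might return it.
previous_date_format = "2021-03-15"
new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")
assert new_date_format == "2021/03/15/"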
Example #3
    def __init__(
        self,
        project_id: str,
        region: str,
        lower_bound_update_datetime: Optional[datetime.datetime],
        gcs_destination_path: Optional[str] = None,
    ):
        self.project_id = project_id
        self.region = region.lower()

        self.auth = SftpAuth.for_region(region)
        self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
        self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

        self.unable_to_download_items: List[str] = []
        self.downloaded_items: List[Tuple[str, datetime.datetime]] = []

        self.lower_bound_update_datetime = lower_bound_update_datetime
        self.bucket = (
            GcsfsDirectoryPath.from_absolute_path(
                gcsfs_direct_ingest_directory_path_for_region(
                    region, SystemLevel.STATE, project_id=self.project_id
                )
            )
            if gcs_destination_path is None
            else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
        )
        self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
            dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
        )
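A hypothetical instantiation of this variant. The enclosing class name is not shown in the snippet, so SftpDownloader below is a placeholder; note that gcs_destination_path is a plain gs:// string here, where the first example takes a GcsfsDirectoryPath:

import datetime

downloader = SftpDownloader(  # placeholder name; the real class is not shown
    project_id="my-project-id",
    region="us_xx",  # placeholder region code
    lower_bound_update_datetime=datetime.datetime(2021, 1, 1),
    gcs_destination_path="gs://my-sftp-test-bucket",
)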
Example #4
def gcsfs_direct_ingest_storage_directory_path_for_region(
    *,
    region_code: str,
    system_level: SystemLevel,
    ingest_instance: DirectIngestInstance,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    """Returns the storage directory where ingested files for |region_code|
    are archived, optionally narrowed to a single |file_type| subdirectory."""
    if project_id is None:
        project_id = metadata.project_id()
        if not project_id:
            raise ValueError("Project id not set")

    suffix = bucket_suffix_for_ingest_instance(ingest_instance)
    bucket_name = build_ingest_storage_bucket_name(
        project_id=project_id,
        system_level_str=system_level.value.lower(),
        suffix=suffix,
    )
    storage_bucket = GcsfsBucketPath(bucket_name)

    if file_type is not None:
        subdir = os.path.join(region_code.lower(), file_type.value)
    else:
        subdir = region_code.lower()
    return GcsfsDirectoryPath.from_dir_and_subdir(storage_bucket, subdir)
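A hedged usage sketch for this helper, assuming the enums shown elsewhere in these examples are importable; the region code and project id are placeholder values:

storage_dir = gcsfs_direct_ingest_storage_directory_path_for_region(
    region_code="us_xx",  # placeholder region code
    system_level=SystemLevel.STATE,
    ingest_instance=DirectIngestInstance.PRIMARY,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    project_id="my-project-id",  # passed explicitly to skip the metadata.project_id() fallback
)
# Returns <storage bucket>/us_xx/<file_type.value>/ for the PRIMARY instance.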
Example #5
    def __init__(
        self,
        *,
        state_code: StateCode,
        sandbox_dataset_prefix: str,
        test_ingest_bucket: GcsfsBucketPath,
    ):
        check_is_valid_sandbox_bucket(test_ingest_bucket)

        super().__init__(
            region=get_region(state_code.value.lower(), is_direct_ingest=True),
            fs=DirectIngestGCSFileSystem(GcsfsFactory.build()),
            ingest_bucket_path=test_ingest_bucket,
            temp_output_directory_path=GcsfsDirectoryPath.from_dir_and_subdir(
                test_ingest_bucket, "temp_raw_data"
            ),
            big_query_client=BigQueryClientImpl(),
        )
        self.sandbox_dataset = (
            f"{sandbox_dataset_prefix}_{super()._raw_tables_dataset()}"
        )
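A hypothetical construction of this sandbox controller. The snippet only shows its __init__, so SandboxIngestController is a placeholder name, StateCode.US_XX is a placeholder state code, and the bucket name is assumed to satisfy check_is_valid_sandbox_bucket:

controller = SandboxIngestController(  # placeholder name; real class not shown
    state_code=StateCode.US_XX,  # placeholder state code
    sandbox_dataset_prefix="my_test",
    test_ingest_bucket=GcsfsBucketPath("my-sandbox-ingest-bucket"),  # assumed valid sandbox bucket
)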