def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[GcsfsDirectoryPath] = None,
):
    self.project_id = project_id
    self.region = region.lower()
    # Region-specific SFTP credentials and download behavior.
    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

    # Bookkeeping for the outcome of each remote item.
    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.skipped_files: List[str] = []

    self.lower_bound_update_datetime = lower_bound_update_datetime
    # Default to the region's SFTP download bucket unless an explicit
    # destination was provided.
    self.bucket = (
        gcsfs_sftp_download_bucket_path_for_region(
            region, SystemLevel.STATE, project_id=self.project_id
        )
        if gcs_destination_path is None
        else gcs_destination_path
    )
    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )
    # Metadata manager pointed at the PRIMARY ingest database for this state.
    self.postgres_direct_ingest_file_metadata_manager = (
        PostgresDirectIngestRawFileMetadataManager(
            region,
            DirectIngestInstance.PRIMARY.database_version(
                SystemLevel.STATE, state_code=StateCode(self.region.upper())
            ).name,
        )
    )
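
# Usage sketch for the constructor above, hedged: the enclosing class name
# (DownloadFilesFromSftpController) is not shown in this snippet, so it is an
# assumption here, as are the project id and region code values.
import datetime

controller = DownloadFilesFromSftpController(
    project_id="my-project-id",  # hypothetical GCP project
    region="us_xx",              # hypothetical region code
    lower_bound_update_datetime=datetime.datetime(2021, 1, 1),
    gcs_destination_path=None,   # None -> default SFTP download bucket
)
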
def _move_files(self, from_uri: str) -> None:
    curr_gcsfs_file_path = GcsfsFilePath.from_absolute_path(from_uri)
    previous_date_format = filename_parts_from_path(curr_gcsfs_file_path).date_str
    # Convert the file's ISO date into a year/month/day subdirectory.
    new_date_format = date.fromisoformat(previous_date_format).strftime("%Y/%m/%d/")

    # Re-normalize the file name, preserving processed vs. unprocessed status.
    path_with_new_file_name = GcsfsFilePath.from_absolute_path(
        to_normalized_unprocessed_file_path_from_normalized_path(
            from_uri, GcsfsDirectIngestFileType.RAW_DATA
        )
    )
    if DirectIngestGCSFileSystem.is_processed_file(curr_gcsfs_file_path):
        path_with_new_file_name = GcsfsFilePath.from_absolute_path(
            to_normalized_processed_file_path_from_normalized_path(
                from_uri, GcsfsDirectIngestFileType.RAW_DATA
            )
        )

    raw_dir_with_date = GcsfsDirectoryPath.from_dir_and_subdir(
        self.region_storage_raw_dir_path, new_date_format
    )
    to_uri = GcsfsFilePath.from_directory_and_file_name(
        raw_dir_with_date, path_with_new_file_name.file_name
    ).uri()

    if not self.dry_run:
        gsutil_mv(from_path=from_uri, to_path=to_uri)
    # Record the move (and advance the progress bar) under the mutex, since
    # moves may run concurrently.
    with self.mutex:
        self.move_list.append((from_uri, to_uri))
        if self.move_progress:
            self.move_progress.next()
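
# Self-contained illustration of the date-partitioning step above: the date
# string parsed out of a normalized file name becomes a year/month/day
# subdirectory. The sample date is a made-up value.
from datetime import date

new_date_format = date.fromisoformat("2021-03-15").strftime("%Y/%m/%d/")
assert new_date_format == "2021/03/15/"
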
def __init__(
    self,
    project_id: str,
    region: str,
    lower_bound_update_datetime: Optional[datetime.datetime],
    gcs_destination_path: Optional[str] = None,
):
    self.project_id = project_id
    self.region = region.lower()
    self.auth = SftpAuth.for_region(region)
    self.delegate = SftpDownloadDelegateFactory.build(region_code=region)
    self.gcsfs = DirectIngestGCSFileSystem(GcsfsFactory.build())

    self.unable_to_download_items: List[str] = []
    self.downloaded_items: List[Tuple[str, datetime.datetime]] = []
    self.lower_bound_update_datetime = lower_bound_update_datetime
    self.bucket = (
        GcsfsDirectoryPath.from_absolute_path(
            gcsfs_direct_ingest_directory_path_for_region(
                region, SystemLevel.STATE, project_id=self.project_id
            )
        )
        if gcs_destination_path is None
        else GcsfsDirectoryPath.from_absolute_path(gcs_destination_path)
    )
    self.download_dir = GcsfsDirectoryPath.from_dir_and_subdir(
        dir_path=self.bucket, subdir=RAW_INGEST_DIRECTORY
    )
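
# Sketch of the override path in this variant, which takes the destination as
# a plain string rather than a GcsfsDirectoryPath; the class name and bucket
# path are assumptions for illustration only.
controller = DownloadFilesFromSftpController(
    project_id="my-project-id",
    region="us_xx",
    lower_bound_update_datetime=None,  # no lower bound: consider all remote files
    gcs_destination_path="my-test-bucket/sftp-downloads",  # hypothetical path
)
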
def gcsfs_direct_ingest_storage_directory_path_for_region(
    *,
    region_code: str,
    system_level: SystemLevel,
    ingest_instance: DirectIngestInstance,
    file_type: Optional[GcsfsDirectIngestFileType] = None,
    project_id: Optional[str] = None,
) -> GcsfsDirectoryPath:
    if project_id is None:
        project_id = metadata.project_id()
        if not project_id:
            raise ValueError("Project id not set")

    suffix = bucket_suffix_for_ingest_instance(ingest_instance)
    bucket_name = build_ingest_storage_bucket_name(
        project_id=project_id,
        system_level_str=system_level.value.lower(),
        suffix=suffix,
    )
    storage_bucket = GcsfsBucketPath(bucket_name)

    # Files are stored under the region code, with an optional per-file-type
    # subdirectory.
    if file_type is not None:
        subdir = os.path.join(region_code.lower(), file_type.value)
    else:
        subdir = region_code.lower()
    return GcsfsDirectoryPath.from_dir_and_subdir(storage_bucket, subdir)
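
# Hypothetical call showing how the storage directory is assembled; the
# region code, project id, and the exact bucket name produced by
# build_ingest_storage_bucket_name are assumptions here.
storage_path = gcsfs_direct_ingest_storage_directory_path_for_region(
    region_code="US_XX",
    system_level=SystemLevel.STATE,
    ingest_instance=DirectIngestInstance.PRIMARY,
    file_type=GcsfsDirectIngestFileType.RAW_DATA,
    project_id="my-project-id",
)
# With file_type set, the returned directory is
# <storage bucket>/us_xx/<file_type.value>/; without it, just <storage bucket>/us_xx/.
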
def __init__(
    self,
    *,
    state_code: StateCode,
    sandbox_dataset_prefix: str,
    test_ingest_bucket: GcsfsBucketPath,
):
    check_is_valid_sandbox_bucket(test_ingest_bucket)
    super().__init__(
        region=get_region(state_code.value.lower(), is_direct_ingest=True),
        fs=DirectIngestGCSFileSystem(GcsfsFactory.build()),
        ingest_bucket_path=test_ingest_bucket,
        temp_output_directory_path=GcsfsDirectoryPath.from_dir_and_subdir(
            test_ingest_bucket, "temp_raw_data"
        ),
        big_query_client=BigQueryClientImpl(),
    )
    self.sandbox_dataset = (
        f"{sandbox_dataset_prefix}_{super()._raw_tables_dataset()}"
    )
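
# Construction sketch for the sandbox controller above; the class name
# (SandboxDirectIngestRawFileImportController), state code, and bucket name
# are assumptions. check_is_valid_sandbox_bucket is expected to reject
# buckets that don't look like sandbox/test buckets.
controller = SandboxDirectIngestRawFileImportController(
    state_code=StateCode.US_XX,  # hypothetical state
    sandbox_dataset_prefix="my_prefix",
    test_ingest_bucket=GcsfsBucketPath("my-sandbox-test-bucket"),
)
# Raw tables then load into a dataset named "my_prefix_<raw tables dataset>",
# keeping sandbox runs separate from the production dataset.
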