def gsutil_get_storage_subdirs_containing_file_types(
        storage_bucket_path: str,
        file_type: GcsfsDirectIngestFileType,
        upper_bound_date: Optional[str],
        lower_bound_date: Optional[str]) -> List[str]:
    """Collects the dated storage subdirectories for |file_type| under |storage_bucket_path|.

    The top-level directories of gs://|storage_bucket_path| are listed and each one is
    handled as follows:
      - If the directory name equals |file_type|.value, every date subdirectory found
        beneath it (via _dfs_get_date_subdirs) whose date falls within the bounds is
        included.
      - Otherwise, if |file_type| is UNSPECIFIED and the directory name is itself a date
        string within the bounds, the top-level directory path is included.

    Both bounds are handed unchanged to is_between_date_strs_inclusive.

    Returns:
        The matching directory paths, in the order they were discovered.
    """

    def _within_bounds(date_str: str):
        # Single place for the bounds check that both branches below rely on.
        return is_between_date_strs_inclusive(
            upper_bound_date=upper_bound_date,
            lower_bound_date=lower_bound_date,
            date_of_interest=date_str)

    matching_paths: List[str] = []
    for top_level_path in gsutil_ls(f'gs://{storage_bucket_path}', directories_only=True):
        top_level_name = os.path.basename(os.path.normpath(top_level_path))

        if top_level_name == file_type.value:
            matching_paths.extend(
                date_path
                for date_path in _dfs_get_date_subdirs([top_level_path])
                if _within_bounds(_date_str_from_date_subdir_path(date_path)))
        elif file_type == GcsfsDirectIngestFileType.UNSPECIFIED:
            # TODO(3162): For now we assume that all files not in raw/ or ingest_view/ storage subdirs are 'raw'
            #  files. Once all files have been migrated to raw/ and ingest_view/ subdirs, delete this part.
            if is_date_str(top_level_name) and _within_bounds(top_level_name):
                matching_paths.append(top_level_path)

    return matching_paths
def _dfs_get_date_subdirs(paths_to_search: List[str], depth: int = 0) -> List[str]: """Traverses down through year/month/day subdirectories to contain list of all date subdirectories that contain files for a given day.""" if depth == 3: return [p for p in paths_to_search if is_date_str(_date_str_from_date_subdir_path(p))] date_subdirs = [] for p in paths_to_search: sub_paths = gsutil_ls(p, directories_only=True) date_subdirs.extend(_dfs_get_date_subdirs(sub_paths, depth=depth + 1)) return date_subdirs
def get_date_subdir_paths(self) -> List[str]:
    """Returns the entries of gs://<self.storage_bucket> whose final path component is a
    date string lying between self.start_date_bound and self.end_date_bound (inclusive).

    Entries whose last path component is not a date string are ignored.
    """

    def _is_date_dir_in_range(candidate_path: str):
        dir_name = os.path.basename(os.path.normpath(candidate_path))
        # The bounds are only consulted once the name is known to be a date string.
        return is_date_str(dir_name) and is_between_date_strs_inclusive(
            upper_bound_date=self.end_date_bound,
            lower_bound_date=self.start_date_bound,
            date_of_interest=dir_name)

    return [
        candidate_path
        for candidate_path in gsutil_ls(f'gs://{self.storage_bucket}')
        if _is_date_dir_in_range(candidate_path)
    ]
def _get_subdirs_to_copy(self) -> List[str]:
    """Returns the names (not full paths) of the date subdirectories in
    gs://<self.prod_storage_bucket> that fall between self.start_date_bound and
    self.end_date_bound (inclusive).

    Listed paths that do not end in '/' are logged and skipped; paths whose final
    component is not a date string are skipped silently.
    """
    names_to_copy: List[str] = []
    for listed_path in gsutil_ls(f'gs://{self.prod_storage_bucket}'):
        if listed_path.endswith('/'):
            dir_name = os.path.basename(os.path.normpath(listed_path))
            if is_date_str(dir_name) and is_between_date_strs_inclusive(
                    upper_bound_date=self.end_date_bound,
                    lower_bound_date=self.start_date_bound,
                    date_of_interest=dir_name):
                names_to_copy.append(dir_name)
        else:
            # Anything without a trailing slash is not in the expected directory format.
            logging.info("Path [%s] is in unexpected format, skipping", listed_path)
    return names_to_copy