def get_staging_area_file_descriptors(
    storage_client: Client, staging_areas: set[str]
) -> dict[str, set[PathWithCrc]]:
    """
    Collect the file descriptors found under each GS staging area.

    For every staging area URL and every member of ``FileMetadataTypes``, the
    blobs under ``<area path>/descriptors/<file type value>`` are listed,
    downloaded as text and parsed as JSON. Each parsed descriptor is turned
    into a ``PathWithCrc`` built from ``target_path_from_descriptor(parsed)``
    and the descriptor's ``"crc32c"`` entry.

    Args:
        storage_client: Storage client used to list the descriptor blobs.
        staging_areas: Staging area URLs; the URL's network location is used
            as the bucket and its path as the root prefix.

    Returns:
        A ``defaultdict`` mapping each staging area (exactly as passed in) to
        the set of ``PathWithCrc`` values found for it. Areas with no
        descriptors get no entry until they are first accessed.

    Raises:
        json.JSONDecodeError: If a descriptor blob is not valid JSON.
        KeyError: If a parsed descriptor has no ``"crc32c"`` key.
    """
    expected: dict[str, set[PathWithCrc]] = defaultdict(set)

    for staging_area in staging_areas:
        url = urlparse(staging_area)
        # Blob names carry no leading slash, so drop it from the URL path.
        # NOTE(review): only the leading slash is stripped; a staging area
        # given with a trailing slash produces "//descriptors" in the prefix.
        # Confirm callers pass URLs without a trailing slash.
        area_root = url.path.lstrip("/")

        for file_type in FileMetadataTypes:
            prefix = f"{area_root}/descriptors/{file_type.value}"
            # Iterate the listing directly instead of copying it into a list.
            for blob in storage_client.list_blobs(url.netloc, prefix=prefix):
                parsed = json.loads(blob.download_as_text())
                expected[staging_area].add(
                    PathWithCrc(
                        target_path_from_descriptor(parsed), parsed["crc32c"]
                    )
                )

    return expected
def path_has_any_data(bucket: str, prefix: str, gcs: Client) -> bool:
    """
    Check whether any blob under the given path has a non-zero size.

    The listing is consumed lazily and the check stops at the first blob
    with data, so blobs after the first non-empty one are never examined.

    Args:
        bucket: Name of the bucket to list.
        prefix: Blob-name prefix restricting the listing.
        gcs: Storage client used to list the blobs.

    Returns:
        True if at least one listed blob has ``size > 0``; False if every
        listed blob is empty or nothing matches the prefix.
    """
    # A generator (not a list) lets any() short-circuit on the first hit.
    return any(blob.size > 0 for blob in gcs.list_blobs(bucket, prefix=prefix))