def publish_versions(prod_folder: str = "v3") -> Response:
    """Builds an index of all blob generations and publishes it as versions.json."""
    prod_folder = _get_request_param("prod_folder", prod_folder)

    # Enumerate all the versions for each of the global tables
    prefix = prod_folder + "/"
    blob_index: Dict[str, List[int]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for table_name in ["aggregated", "main"] + list(get_table_names()):
        blobs = bucket.list_blobs(prefix=prefix + table_name, versions=True)
        for blob in blobs:
            fname = blob.name.replace(prefix, "")
            blob_index[fname] = blob_index.get(fname, [])
            blob_index[fname].append(blob.generation)

    # Repeat the process for the intermediate tables
    bucket = get_storage_bucket(GCS_BUCKET_TEST)
    blobs = bucket.list_blobs(prefix="intermediate/", versions=True)
    for blob in blobs:
        # Keep the "intermediate/" prefix to distinguish these from the global tables
        fname = blob.name
        blob_index[fname] = blob_index.get(fname, [])
        blob_index[fname].append(blob.generation)

    with temporary_directory() as workdir:
        # Write the index to disk
        fname = workdir / "versions.json"
        with open(fname, "w") as fh:
            json.dump(blob_index, fh)

        # Upload it to the root of the production folder
        upload_folder(GCS_BUCKET_PROD, prod_folder + "/", workdir)

    return Response("OK", status=200)
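# Sketch (not part of the original module): how a consumer might use versions.json to pin
# a table to a specific generation. The helper name and arguments are illustrative;
# Bucket.blob() accepts a generation argument that addresses one exact object version.
def example_download_pinned_table(table_path: str, local_path: str, prod_folder: str = "v3") -> None:
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    blob_index = json.loads(bucket.blob(f"{prod_folder}/versions.json").download_as_bytes())
    # Use the earliest recorded generation for this table; any listed entry would work
    generation = blob_index[table_path][0]
    pinned_blob = bucket.blob(f"{prod_folder}/{table_path}", generation=generation)
    pinned_blob.download_to_filename(local_path)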
def download_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _download_blob(local_folder: Path, blob: Blob) -> None:
        # Remove the prefix from the remote path
        rel_path = blob.name.split(f"{remote_path}/", 1)[-1]
        if filter_func is None or filter_func(Path(rel_path)):
            logger.log_debug(f"Downloading {rel_path} to {local_folder}/")
            file_path = local_folder / rel_path
            file_path.parent.mkdir(parents=True, exist_ok=True)
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    return blob.download_to_filename(str(file_path))
                except Exception:
                    log_message = f"Error downloading {rel_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If the error persists, something must be wrong with the network, so we are
            # better off crashing the AppEngine server.
            error_message = f"Error downloading {rel_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_download_blob, local_folder)
    map_iter = bucket.list_blobs(prefix=remote_path)
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
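# Usage sketch (paths are assumptions): pull only the CSV outputs from the production
# bucket into a scratch directory, letting filter_func skip everything else.
def example_download_csv_only(local_folder: Path) -> None:
    download_folder(
        GCS_BUCKET_PROD,
        "v3",
        local_folder,
        filter_func=lambda rel_path: rel_path.suffix == ".csv",
    )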
def cache_build_map() -> Dict[str, List[str]]:
    sitemap: Dict[str, List[str]] = {}
    bucket = get_storage_bucket(GCS_BUCKET_PROD)
    for blob in bucket.list_blobs(prefix="cache"):
        filename = blob.name.split("/")[-1]
        if filename == "sitemap.json":
            continue
        sitemap_key = filename.split(".")[0]
        sitemap[sitemap_key] = sitemap.get(sitemap_key, [])
        sitemap[sitemap_key].append(blob.name)

    # Sort all the cache items
    for sitemap_key, snapshot_list in sitemap.items():
        sitemap[sitemap_key] = list(sorted(snapshot_list))

    return sitemap
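# For reference, the returned sitemap maps each cache key (file name minus extension) to
# its sorted list of hourly snapshot blobs (names below are illustrative only):
#
# {
#     "jp_tokyo": [
#         "cache/2020-07-01-10/jp_tokyo.html",
#         "cache/2020-07-01-11/jp_tokyo.html",
#     ],
# }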
def upload_folder(
    bucket_name: str,
    remote_path: str,
    local_folder: Path,
    filter_func: Callable[[Path], bool] = None,
) -> None:
    bucket = get_storage_bucket(bucket_name)

    def _upload_file(remote_path: str, file_path: Path):
        target_path = file_path.relative_to(local_folder)
        if filter_func is None or filter_func(target_path):
            logger.log_debug(f"Uploading {target_path} to {remote_path}/")
            blob = bucket.blob(os.path.join(remote_path, target_path))
            for i in range(BLOB_OP_MAX_RETRIES):
                try:
                    name, suffix = file_path.name, file_path.suffix
                    # If the extension is one we should compress, upload a gzipped copy
                    if suffix[1:] in COMPRESS_EXTENSIONS:
                        with temporary_directory() as workdir:
                            gzipped_file = workdir / name
                            gzip_file(file_path, gzipped_file)
                            blob.content_encoding = "gzip"
                            return blob.upload_from_filename(gzipped_file)
                    # Otherwise upload the file as-is
                    else:
                        return blob.upload_from_filename(file_path)
                except Exception:
                    log_message = f"Error uploading {target_path}."
                    logger.log_warning(log_message, traceback=traceback.format_exc())
                    # Exponential back-off
                    time.sleep(2 ** i)

            # If the error persists, something must be wrong with the network, so we are
            # better off crashing the AppEngine server.
            error_message = f"Error uploading {target_path}"
            logger.log_error(error_message)
            raise IOError(error_message)

    map_func = partial(_upload_file, remote_path)
    map_iter = local_folder.glob("**/*.*")
    list(thread_map(map_func, map_iter, total=None, disable=True, max_workers=8))
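# Usage sketch: publish locally built intermediate outputs to the test bucket, uploading
# only JSON files; extensions listed in COMPRESS_EXTENSIONS are gzipped transparently.
def example_upload_json_only(local_folder: Path) -> None:
    upload_folder(
        GCS_BUCKET_TEST,
        "intermediate",
        local_folder,
        filter_func=lambda rel_path: rel_path.suffix == ".json",
    )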
def cache_pull() -> Response:
    with temporary_directory() as workdir:
        now = datetime.datetime.utcnow()
        output_folder = workdir / now.strftime("%Y-%m-%d-%H")
        output_folder.mkdir(parents=True, exist_ok=True)

        def _pull_source(cache_source: Dict[str, str]):
            url = cache_source.pop("url")
            data = cache_source.pop("data", None)
            output = cache_source.pop("output")
            logger.log_info(f"Downloading {url} into {output}")
            buffer = BytesIO()
            try:
                download(url, buffer, data=data)
                with (output_folder / output).open("wb") as fd:
                    fd.write(buffer.getvalue())
                logger.log_info(f"Downloaded {output} successfully")
            except Exception:
                logger.log_error(f"Cache pull failed for {url}.", traceback=traceback.format_exc())

        # Pull each of the sources from the cache config
        with (SRC / "cache.yaml").open("r") as fd:
            cache_list = yaml.safe_load(fd)
        list(thread_map(_pull_source, cache_list, disable=True))

        # Upload all cached data to the bucket
        upload_folder(GCS_BUCKET_PROD, "cache", workdir)

        # Build the sitemap for all cached files
        logger.log_info("Building sitemap")
        sitemap = cache_build_map()
        bucket = get_storage_bucket(GCS_BUCKET_PROD)
        blob = bucket.blob("cache/sitemap.json")
        blob.upload_from_string(json.dumps(sitemap))

    return Response("OK", status=200)
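# For reference, each entry in cache.yaml is expected to provide at least "url" and
# "output", plus an optional "data" payload forwarded to download() (values below are
# illustrative only):
#
# - url: https://example.com/daily-report
#   output: example_daily.json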
def download_file(bucket_name: str, remote_path: str, local_path: str) -> None:
    bucket = get_storage_bucket(bucket_name)
    logger.log_debug(f"Downloading {remote_path} to {local_path}")
    return bucket.blob(remote_path).download_to_filename(str(local_path))