def publish_v3_location_subsets(
    location_key_from: str = None, location_key_until: str = None
) -> Response:
    location_key_from = _get_request_param("location_key_from", location_key_from)
    location_key_until = _get_request_param("location_key_until", location_key_until)

    with temporary_directory() as workdir:
        input_folder = workdir / "input"
        intermediate_folder = workdir / "temp"
        output_folder = workdir / "output"
        input_folder.mkdir(parents=True, exist_ok=True)
        output_folder.mkdir(parents=True, exist_ok=True)

        location_keys = list(table_read_column(SRC / "data" / "metadata.csv", "key"))
        if location_key_from is not None:
            location_keys = [key for key in location_keys if key >= location_key_from]
        if location_key_until is not None:
            location_keys = [key for key in location_keys if key <= location_key_until]
        logger.log_info(
            f"Publishing {len(location_keys)} location subsets "
            f"from {location_keys[0]} until {location_keys[-1]}"
        )

        # Download all the global tables into our local storage
        forbid_tokens = ("/", "main.", "aggregated.")
        download_folder(
            GCS_BUCKET_PROD,
            "v3",
            input_folder,
            lambda x: x.suffix == ".csv" and all(token not in str(x) for token in forbid_tokens),
        )
        logger.log_info(f"Downloaded {sum(1 for _ in input_folder.glob('**/*.csv'))} CSV files")

        # Break out each table into separate folders based on the location key
        publish_location_breakouts(input_folder, intermediate_folder, use_table_names=V3_TABLE_LIST)
        logger.log_info("Created all table location breakouts")

        # Create a folder which will host all the location aggregates
        location_aggregates_folder = output_folder / "location"
        location_aggregates_folder.mkdir(parents=True, exist_ok=True)

        # Aggregate the tables for each location independently
        publish_location_aggregates(
            intermediate_folder,
            location_aggregates_folder,
            location_keys,
            use_table_names=V3_TABLE_LIST,
        )
        logger.log_info("Aggregated all table breakouts by location")

        # Upload the results to the prod bucket
        upload_folder(GCS_BUCKET_PROD, "v3", output_folder)

    return Response("OK", status=200)
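

# --- Usage sketch (not part of the original module) ---
# A minimal example of how this handler might be wired into a Flask app. This is an
# assumption: the Response return type and _get_request_param suggest the function
# reads its parameters from the request context when they are not passed explicitly,
# but the actual routing lives elsewhere in the repository. The route path, host,
# port, and example location keys below are all hypothetical.
if __name__ == "__main__":
    from flask import Flask

    app = Flask(__name__)
    app.add_url_rule(
        "/publish_v3_location_subsets",
        view_func=publish_v3_location_subsets,
        methods=["GET"],
    )
    # Example request, limiting the run to keys between US_CA and US_FL (hypothetical):
    #   curl "http://localhost:8080/publish_v3_location_subsets?location_key_from=US_CA&location_key_until=US_FL"
    app.run(host="127.0.0.1", port=8080)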