def convert_tables_to_json(csv_folder: Path, output_folder: Path, **tqdm_kwargs) -> Iterable[Path]:
    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_opts = dict(total=len(map_iter), desc="Converting to JSON", **tqdm_kwargs)
    map_func = partial(_try_json_covert, get_schema(), csv_folder, output_folder)
    return list(pbar(map(map_func, map_iter), **map_opts))
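# Illustrative usage sketch (not part of the original module): how convert_tables_to_json might be
# invoked once the CSV outputs exist. The folder layout used here is an assumption made for the
# example only; any tqdm keyword arguments are forwarded to the progress bar.
from pathlib import Path


def example_convert_outputs() -> None:
    csv_folder = Path("output") / "tables"  # assumed location of the published CSV tables
    json_folder = Path("output") / "json"   # assumed destination for the JSON conversions
    json_folder.mkdir(parents=True, exist_ok=True)
    converted = convert_tables_to_json(csv_folder, json_folder, disable=True)
    print(f"Converted {len(converted)} tables to JSON")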
def publish_location_breakouts(
    tables_folder: Path, output_folder: Path, use_table_names: List[str] = None
) -> List[Path]:
    """
    Breaks out each of the tables in `tables_folder` based on location key, and writes them into
    subdirectories of `output_folder`.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting location data.
        use_table_names: [Optional] Subset of table names to break out; defaults to V2_TABLE_LIST.
    """
    # Default to a known list of tables to use when none is given
    map_iter = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Break out each table into separate folders based on the location key
    _logger.log_info(f"Breaking out tables {[x.stem for x in map_iter]}")
    map_func = partial(table_breakout, output_folder=output_folder, breakout_column="location_key")
    return list(pbar(map(map_func, map_iter), desc="Breaking out tables", total=len(map_iter)))
def _test_join_all(self, how_mem: str, how_pandas: str):
    # Create a custom function used to read tables casting to the expected schema
    read_table_ = partial(read_table, schema=SCHEMA, low_memory=False)

    # Test joining the index table with every other table
    left = SRC / "test" / "data" / "index.csv"
    for right in pbar([*(SRC / "test" / "data").glob("*.csv")], leave=False):
        if left.name == right.name:
            continue

        left_columns = get_table_columns(left)
        right_columns = get_table_columns(right)

        if "date" not in right_columns:
            self._test_join_pair(read_table_, SCHEMA, left, right, ["key"], how_mem, how_pandas)

        if "date" in left_columns and "date" not in right_columns:
            self._test_join_pair(read_table_, SCHEMA, left, right, ["key"], how_mem, how_pandas)

        if "date" in left_columns and "date" in right_columns:
            self._test_join_pair(
                read_table_, SCHEMA, left, right, ["key", "date"], how_mem, how_pandas
            )
def publish_location_aggregates( breakout_folder: Path, output_folder: Path, location_keys: Iterable[str], use_table_names: List[str] = None, **tqdm_kwargs, ) -> List[Path]: """ This method joins *all* the tables for each location into a main.csv table. Arguments: tables_folder: Directory containing input CSV files. output_folder: Output path for the resulting location data. location_keys: List of location keys to do aggregation for. """ # Create a main.csv file for each of the locations in parallel map_iter = list(location_keys) _logger.log_info(f"Aggregating outputs for {len(map_iter)} location keys") map_opts = dict(total=len(map_iter), desc="Creating location subsets", **tqdm_kwargs) map_func = partial( _aggregate_location_breakouts, breakout_folder, output_folder, use_table_names=use_table_names, ) return list(pbar(map(map_func, map_iter), **map_opts))
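# Illustrative sketch (not from the original source): how the breakout and aggregation steps above
# might be chained together. The intermediate folder names are assumptions for the example; only
# the two function signatures are taken from the code above, and the caller is expected to supply
# the list of location keys.
from pathlib import Path
from typing import List


def example_publish_locations(
    tables_folder: Path, output_folder: Path, location_keys: List[str]
) -> None:
    breakout_folder = output_folder / "breakout"  # assumed intermediate folder
    location_folder = output_folder / "location"  # assumed final per-location folder
    breakout_folder.mkdir(parents=True, exist_ok=True)
    location_folder.mkdir(parents=True, exist_ok=True)

    # Break out every table by location key, then build a per-location aggregate
    publish_location_breakouts(tables_folder, breakout_folder)
    publish_location_aggregates(breakout_folder, location_folder, location_keys, disable=True)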
def _process_state(data: DataFrame) -> DataFrame:
    data["date"] = data["date"].apply(lambda x: str(x)[:10])
    data["subregion2_code"] = data["fips_code"].apply(lambda x: numeric_code_as_string(x, 5))
    data["key"] = "US_" + data["state"] + "_" + data["subregion2_code"]
    data.drop(
        columns=[
            "subregion2_code",
            "state",
            "fips_code",
            "county",
            "report_date_window_end",
            "report_date_window_start",
        ],
        inplace=True,
    )

    # Make sure the data is properly sorted, since we need to compute diffs
    data.sort_values(["key", "date"], inplace=True)

    # Get a mapping between rolling average column names and their daily counterparts
    col_prefixes = (
        "new_cases",
        "new_deaths",
        "new_test_results_reported",
        "admissions_covid_confirmed",
    )
    rolling_suffix = "_7_day_rolling_average"
    rolling_columns_map = {col + rolling_suffix: col for col in col_prefixes}

    # Seed the daily versions of the columns with empty values
    for name in rolling_columns_map.values():
        data[name] = None

    # Convert the rolling average columns to daily values one key at a time.
    # This can probably be done with some clever grouping function instead, but doing it
    # iteratively is fast enough and works reliably.
    for key in pbar(data["key"].unique(), desc="Computing daily values from rolling means"):
        mask = data["key"] == key
        for col, name in rolling_columns_map.items():
            subset = data.loc[mask, col].dropna()
            data.loc[subset.index, name] = recover_from_rolling_mean(subset, 7)

    # Get rid of the rolling average columns now that we have the daily values
    data.drop(columns=rolling_columns_map.keys(), inplace=True)

    return table_rename(
        data,
        {
            "new_cases": "new_confirmed",
            "new_deaths": "new_deceased",
            "new_test_results_reported": "new_tested",
            "admissions_covid_confirmed": "new_hospitalized",
        },
    )
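# Illustrative sketch only: recover_from_rolling_mean is a helper from this codebase whose exact
# implementation is not shown here. The version below is an assumption about the underlying math,
# included to clarify what the loop above is doing: if r[t] is the w-day rolling average of the
# daily series x, then w * r[t] equals the sum of the last w daily values, so each daily value can
# be recovered iteratively once the values before the start of the series are treated as zero.
from collections import deque

from pandas import Series


def recover_from_rolling_mean_sketch(rolling: Series, window: int) -> Series:
    """Recover daily values from a rolling mean, assuming zero values before the series start."""
    recent = deque([0.0] * (window - 1), maxlen=window - 1)  # last window-1 recovered daily values
    daily = []
    for value in rolling:
        # window * rolling_mean = sum of the last `window` daily values; solve for the newest one
        current = window * value - sum(recent)
        daily.append(current)
        recent.append(current)
    return Series(daily, index=rolling.index)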
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Get all the weather stations with data up until last month from inventory
    year = int(parse_opts.get("year")) if "year" in parse_opts else None
    cur_date = datetime.date(year, 12, 31) if year else datetime.date.today()
    min_date = (cur_date - datetime.timedelta(days=30)).strftime("%Y%m%d")
    stations = read_file(sources["inventory"]).rename(
        columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
    )
    stations = stations[stations.END > int(min_date)]
    stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

    # Open the station data as a compressed file
    station_cache = dict()
    with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

        # Build the station cache by decompressing all files in memory
        map_iter = stations_tar.getmembers()
        map_func = partial(_extract_station, stations_tar)
        map_opts = dict(desc="Decompressing", total=len(map_iter))
        for station_item in pbar(map(map_func, map_iter), **map_opts):
            station_cache.update(station_item)

    # Get all the POI from metadata and go through each key
    keep_columns = ["key", "latitude", "longitude"]
    metadata = read_file(sources["geography"])[keep_columns].dropna()

    # Only use keys present in the metadata table
    metadata = metadata.merge(aux["metadata"])[keep_columns]

    # Convert all coordinates to radians
    stations["lat"] = stations["lat"].apply(math.radians)
    stations["lon"] = stations["lon"].apply(math.radians)
    metadata["lat"] = metadata["latitude"].apply(math.radians)
    metadata["lon"] = metadata["longitude"].apply(math.radians)

    # Use a manager to handle memory accessed across processes
    manager = Manager()
    station_cache = manager.dict(station_cache)

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(_process_location, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = (record for _, record in metadata.iterrows())

    # Bottleneck is network so we can use lots of threads in parallel
    records = process_map(map_func, map_iter, total=len(metadata))

    return concat(records)
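# Illustrative sketch only: _process_location is defined elsewhere in this codebase and its exact
# behavior is not shown here. The radian conversion above suggests a great-circle (haversine)
# distance computation between each location and the candidate stations, so the helper below shows
# that distance calculation on already-converted coordinates. The function name and the Earth
# radius constant are assumptions made for this example.
import math

EARTH_RADIUS_KM = 6373.0  # approximate Earth radius, assumed value for the example


def haversine_distance_km(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Great-circle distance in kilometers between two points given in radians."""
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    return 2 * EARTH_RADIUS_KM * math.asin(math.sqrt(a))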
def copy_tables(tables_folder: Path, public_folder: Path) -> None:
    """
    Copy tables as-is from the tables folder into the public folder.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        public_folder: Output folder where the CSV files will be copied to.
    """
    for output_file in pbar([*tables_folder.glob("*.csv")], desc="Copy tables"):
        shutil.copy(output_file, public_folder / output_file.name)
def publish_subset_latest(
    tables_folder: Path, output_folder: Path, key: str = "location_key", **tqdm_kwargs
) -> Iterable[Path]:
    """
    This method outputs the latest record by date per location key for each of the input tables.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [table for table in tables_folder.glob("*.csv") if table.stem != agg_table_name]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter), desc="Creating latest subsets", **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv") if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
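# Illustrative usage sketch (not part of the original source): producing the per-table "latest"
# subsets plus the aggregated.csv file. Folder names are assumptions for the example; note that
# publish_subset_latest is a generator, so it must be iterated for any work to happen.
from pathlib import Path


def example_publish_latest(tables_folder: Path) -> None:
    latest_folder = tables_folder / "latest"  # assumed destination folder
    latest_folder.mkdir(parents=True, exist_ok=True)
    for output_file in publish_subset_latest(tables_folder, latest_folder, disable=True):
        print(f"Wrote latest subset: {output_file}")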
def create_table_subsets(main_table_path: Path, output_path: Path) -> Iterable[Path]:
    # Create subsets with each known key
    yield from _subset_grouped_key(main_table_path, output_path, desc="Grouped key subsets")

    # Create a subfolder to store the latest row for each key
    latest_path = output_path / "latest"
    latest_path.mkdir(parents=True, exist_ok=True)

    # Create a subset with the latest known day of data for each key
    map_opts = dict(desc="Latest subset")
    map_func = partial(_grouped_subset_latest, latest_path)
    map_iter = list(output_path.glob("*.csv")) + [main_table_path]
    yield from pbar(map(map_func, map_iter), **map_opts)
def parse_dataframes(
    self, dataframes: Dict[str, DataFrame], aux: Dict[str, DataFrame], **parse_opts
) -> DataFrame:
    data = table_rename(
        dataframes[0],
        {
            "Country Name": "country_name",
            "Country Code": "3166-1-alpha-3",
            "Indicator Name": "indicator_name",
            "Indicator Code": "indicator_code",
        },
        drop=False,
    )

    # Ensure all columns are of str type
    data.columns = list(map(str, data.columns))

    data = data.merge(aux["worldbank_indicators"]).merge(aux["country_codes"])
    data = data.drop(columns=["country_name", "3166-1-alpha-2", "3166-1-alpha-3"])

    indicators = parse_opts.get("indicators", {code: code for code in data.indicator_code.values})
    min_year = int(parse_opts.get("min_year", 2015))
    data = data[data.indicator_code.isin(indicators.values())]

    # Index data by indicator code for performance optimization
    keys = data.key.unique()
    indexed = {key: data[data.key == key].set_index("indicator_code") for key in keys}

    # There is probably a fancy pandas function to do this more efficiently, but this works for now
    map_func = partial(self._process_record, indexed, indicators, min_year)
    records = list(pbar(map(map_func, keys), desc="WorldBank Indicators", total=len(keys)))

    # Some countries are better described as subregions
    data = DataFrame.from_records(records)
    data.loc[data.key == "MF", "key"] = "FR_MF"

    # Return all records in DataFrame form
    return data
def parse(self, sources: List[str], aux: Dict[str, DataFrame], **parse_opts):
    data = None
    with zipfile.ZipFile(sources[0]) as zipped:
        data = zipped.read("WDIData.csv")
        data = read_csv(BytesIO(data))
    assert data is not None

    data = data.rename(
        columns={
            "Country Code": "3166-1-alpha-3",
            "Indicator Name": "indicator_name",
            "Indicator Code": "indicator_code",
        }
    )
    data = data.merge(aux["worldbank_indicators"]).merge(aux["country_codes"])
    data = data.drop(columns=["Country Name", "3166-1-alpha-2", "3166-1-alpha-3"])

    indicators = parse_opts.get("indicators", {code: code for code in data.indicator_code.values})
    min_year = int(parse_opts.get("min_year", 2015))
    data = data[data.indicator_code.isin(indicators.values())]

    # Index data by indicator code for performance optimization
    keys = data.key.unique()
    indexed = {key: data[data.key == key].set_index("indicator_code") for key in keys}

    # There is probably a fancy pandas function to do this more efficiently, but this works for now
    map_func = partial(self._process_record, indexed, indicators, min_year)
    records = list(pbar(map(map_func, keys), desc="WorldBank Indicators", total=len(keys)))

    # Some countries are better described as subregions
    data = DataFrame.from_records(records)
    data.loc[data.key == "MF", "key"] = "FR_MF"

    # Return all records in DataFrame form
    return data
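# Illustrative sketch only: self._process_record is defined elsewhere and its implementation is
# not shown here. Based on how it is called above (one pre-indexed frame per key, a mapping of
# output column name to indicator code, and a minimum year), a plausible reading is that it builds
# one flat record per key by picking, for each indicator, the most recent non-null yearly value at
# or after min_year. The helper below is an assumption written to clarify that flow, not the
# actual method.
from typing import Any, Dict

from pandas import DataFrame, notna


def process_record_sketch(
    indexed: Dict[str, DataFrame], indicators: Dict[str, str], min_year: int, key: str
) -> Dict[str, Any]:
    record: Dict[str, Any] = {"key": key}
    subset = indexed[key]
    for column_name, indicator_code in indicators.items():
        if indicator_code not in subset.index:
            continue
        row = subset.loc[indicator_code]
        # Year columns are strings; walk backwards from the most recent year to find a value
        year_columns = sorted(col for col in row.index if col.isdigit() and int(col) >= min_year)
        for year in reversed(year_columns):
            if notna(row[year]):
                record[column_name] = row[year]
                break
    return record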
def merge_location_breakout_tables(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires all of the location breakout tables to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
    """
    # Use only the aggregated main tables
    table_paths = list(sorted(tables_folder.glob("**/*.csv")))

    # Concatenate all the individual breakout tables together
    _logger.log_info(f"Concatenating {len(table_paths)} location breakout tables")
    table_concat(pbar(table_paths, desc="Concatenating tables"), output_path)
def _subset_grouped_key(
    main_table_path: Path, output_folder: Path, desc: str = None
) -> Iterable[Path]:
    """ Outputs subsets of the table, each containing only the records for a particular key """

    # Read the header of the main file to get the columns
    with open(main_table_path, "r") as fd:
        header = next(fd)

    # Do a first sweep to get the number of keys so we can accurately report progress
    key_set = set()
    for line in read_lines(main_table_path, skip_empty=True):
        key, data = line.split(",", 1)
        key_set.add(key)

    # We make use of the main table being sorted by <key, date> and do a linear sweep of the file
    # assuming that once the key changes we won't see it again in future lines
    key_folder: Path = None
    current_key: str = None
    file_handle: TextIO = None
    progress_bar = pbar(total=len(key_set), desc=desc)
    for idx, line in enumerate(read_lines(main_table_path, skip_empty=True)):
        key, data = line.split(",", 1)

        # Skip the header line
        if idx == 0:
            continue

        # When the key changes, close the previous file handle and open a new one
        if current_key != key:
            if file_handle:
                file_handle.close()
            if key_folder:
                yield key_folder / "main.csv"
            current_key = key
            key_folder = output_folder / key
            key_folder.mkdir(exist_ok=True)
            file_handle = (key_folder / "main.csv").open("w")
            file_handle.write(f"{header}")
            progress_bar.update(1)

        file_handle.write(f"{key},{data}")

    # Close the last file handle and we are done
    file_handle.close()
    progress_bar.close()
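# Illustrative usage sketch (not part of the original source): _subset_grouped_key streams one
# main.csv per key into per-key subfolders and yields each path once that key has been fully
# written. The folder names used here are assumptions for the example.
from pathlib import Path


def example_subset_by_key(main_table_path: Path, output_folder: Path) -> None:
    output_folder.mkdir(parents=True, exist_ok=True)
    for subset_path in _subset_grouped_key(main_table_path, output_folder, desc="Key subsets"):
        # Each yielded path looks like <output_folder>/<key>/main.csv
        print(f"Wrote subset: {subset_path}")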
def test_convert_csv_to_json_records(self):
    for json_convert_method in (
        _convert_csv_to_json_records_fast,
        _convert_csv_to_json_records_slow,
    ):
        with temporary_directory() as workdir:

            for csv_file in pbar([*(SRC / "test" / "data").glob("*.csv")], leave=False):
                json_output = workdir / csv_file.name.replace("csv", "json")
                json_convert_method(SCHEMA, csv_file, json_output)

                with json_output.open("r") as fd:
                    json_obj = json.load(fd)
                    json_df = DataFrame(data=json_obj["data"], columns=json_obj["columns"])

                csv_test_file = workdir / json_output.name.replace("json", "csv")
                export_csv(json_df, csv_test_file, schema=SCHEMA)
                _compare_tables_equal(self, csv_file, csv_test_file)
data = read_file(f"{URL_OUTPUTS_PROD}/{v2_name}.csv") rename_columns = { col: snake_to_camel_case(col) for col in data.columns } export_csv(data.rename(columns=rename_columns), public_folder / f"{v1_name}.csv") # Create the v1 forecast.csv file export_csv( build_forecast(read_file(public_folder / "data_minimal.csv")), public_folder / "data_forecast.csv", ) # Convert all v1 CSV files to JSON using record format for csv_file in pbar([*(public_folder).glob("*.csv")], desc="V1 JSON conversion"): data = read_file(csv_file, low_memory=False) json_path = str(csv_file).replace("csv", "json") data.to_json(json_path, orient="records") # Create the v2 folder v2_folder = public_folder / "v2" v2_folder.mkdir(exist_ok=True, parents=True) # Download the v2 tables which can fit under 100MB for table_name in pbar( ( "by-age", "by-sex", "demographics", "economy",
def wikidata_property(
    prop: str,
    entities: List[str],
    query_template: str = _WD_QUERY,
    logger: ErrorLogger = ErrorLogger(),
    offset: int = 0,
    **tqdm_kwargs,
) -> Any:
    """
    Query a single property from Wikidata, and return all entities which are part of the provided
    list which contain that property.

    Arguments:
        prop: Wikidata property, for example P1082 for population.
        entities: List of Wikidata identifiers to query the desired property.
        query_template: [Optional] SPARQL query used to retrieve `prop`.
        logger: [Optional] ErrorLogger instance to use for logging.
        offset: [Optional] Number of items to skip in the result set.
    Returns:
        Iterable[Tuple[str, Any]]: Iterable of <Wikidata ID, property value>
    """
    # Time to wait before retry in case of failure
    wait_time = _INIT_WAIT_TIME

    # Build the query from template
    tpl = query_template + " LIMIT {limit} OFFSET {offset}"
    query = tpl.format(prop=prop, limit=_LIMIT, offset=offset)

    # Keep trying the request until it succeeds, or _MAX_RETRIES is reached
    for i in range(_MAX_RETRIES):
        response = None

        try:
            start_time = time.monotonic()
            params = {"query": query, "format": "json"}
            req_opts = dict(headers=_REQUEST_HEADER, params=params, timeout=_WD_TIMEOUT)
            response = requests.get(_WD_URL, **req_opts)
            elapsed_time = time.monotonic() - start_time
            log_opts = dict(status=response.status_code, url=_WD_URL, time=elapsed_time, **params)
            logger.log_info("Wikidata SPARQL server response", **log_opts)
            data = response.json()

            # Yield each binding which corresponds to one of the requested entities
            for item in pbar(data["results"]["bindings"], **tqdm_kwargs):
                pid = item["pid"]["value"].split("/")[-1]
                if pid in entities:
                    yield pid, item["prop"]["value"]

            # If we got exactly `_LIMIT` results, there may be more, so keep increasing the offset
            # until we run out of results
            if len(data["results"]["bindings"]) == _LIMIT:
                yield from wikidata_property(
                    prop,
                    entities,
                    query_template=query_template,
                    logger=logger,
                    offset=offset + _LIMIT,
                    **tqdm_kwargs,
                )

            # If no exceptions were thrown, we have reached the end
            logger.log_info("Wikidata SPARQL results end reached")
            return

        except Exception as exc:

            # If we have reached the error limit, log and re-raise the error
            if i + 1 >= _MAX_RETRIES:
                msg = response.text if response is not None else "Unknown error"
                logger.log_error(msg, exc=exc, traceback=traceback.format_exc())
                raise exc

            # Use exponential backoff in case of error
            logger.log_info(f"({i + 1}) Request error. Retry in {wait_time} seconds...", exc=exc)
            time.sleep(wait_time)
            wait_time *= 2
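# Illustrative usage sketch (not part of the original source): querying the population property
# (P1082) for a handful of Wikidata entities. The entity identifiers below are real Wikidata IDs
# used only as example inputs; any tqdm keyword arguments are forwarded to the progress bar.
def example_query_population() -> None:
    entities = ["Q30", "Q96", "Q155"]  # United States, Mexico, Brazil
    for entity_id, value in wikidata_property("P1082", entities, desc="Population"):
        print(f"{entity_id}: {value}")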
def main():
    # Create the folder which will be published
    public_folder = SRC / ".." / "output" / "public"
    public_folder.mkdir(exist_ok=True, parents=True)

    # Create the v1 data.csv file
    main_table = read_file(f"{URL_OUTPUTS_PROD}/main.csv", low_memory=False)
    data = main_table[main_table.aggregation_level < 2]
    rename_columns = {
        "date": "Date",
        "key": "Key",
        "country_code": "CountryCode",
        "country_name": "CountryName",
        "subregion1_code": "RegionCode",
        "subregion1_name": "RegionName",
        "total_confirmed": "Confirmed",
        "total_deceased": "Deaths",
        "latitude": "Latitude",
        "longitude": "Longitude",
        "population": "Population",
    }
    data = data[rename_columns.keys()].rename(columns=rename_columns)
    data = data.dropna(subset=["Confirmed", "Deaths"], how="all")
    data = data.sort_values(["Date", "Key"])
    export_csv(data, public_folder / "data.csv")

    # Create the v1 data_minimal.csv file
    export_csv(data[["Date", "Key", "Confirmed", "Deaths"]], public_folder / "data_minimal.csv")

    # Create the v1 data_latest.csv file
    latest = main_table[main_table.aggregation_level < 2]
    latest = latest.sort_values("date").groupby("key").last().reset_index()
    latest = latest[rename_columns.keys()].rename(columns=rename_columns)
    latest = latest.dropna(subset=["Confirmed", "Deaths"], how="all")
    latest = latest.sort_values(["Key", "Date"])
    export_csv(latest, public_folder / "data_latest.csv")

    # Create the v1 weather.csv file
    weather = read_file(f"{URL_OUTPUTS_PROD}/weather.csv")
    weather = weather[weather.key.apply(lambda x: len(x.split("_")) < 3)]
    weather = weather.rename(columns={"noaa_distance": "distance", "noaa_station": "station"})
    rename_columns = {col: snake_to_camel_case(col) for col in weather.columns}
    export_csv(weather.rename(columns=rename_columns), public_folder / "weather.csv")

    # Create the v1 mobility.csv file
    mobility = read_file(f"{URL_OUTPUTS_PROD}/mobility.csv")
    mobility = mobility[mobility.key.apply(lambda x: len(x.split("_")) < 3)]
    mobility = drop_na_records(mobility, ["date", "key"])
    rename_columns = {
        col: snake_to_camel_case(col).replace("Mobility", "") for col in mobility.columns
    }
    export_csv(mobility.rename(columns=rename_columns), public_folder / "mobility.csv")

    # Create the v1 CSV files which only require simple column mapping
    v1_v2_name_map = {"response": "oxford-government-response"}
    for v1_name, v2_name in v1_v2_name_map.items():
        data = read_file(f"{URL_OUTPUTS_PROD}/{v2_name}.csv")
        rename_columns = {col: snake_to_camel_case(col) for col in data.columns}
        export_csv(data.rename(columns=rename_columns), public_folder / f"{v1_name}.csv")

    # Create the v1 forecast.csv file
    export_csv(
        build_forecast(read_file(public_folder / "data_minimal.csv")),
        public_folder / "data_forecast.csv",
    )

    # Convert all v1 CSV files to JSON using record format
    for csv_file in pbar([*public_folder.glob("*.csv")], desc="V1 JSON conversion"):
        data = read_file(csv_file, low_memory=False)
        json_path = str(csv_file).replace("csv", "json")
        data.to_json(json_path, orient="records")

    # Create the v2 folder
    v2_folder = public_folder / "v2"
    v2_folder.mkdir(exist_ok=True, parents=True)

    # Download the v2 tables which can fit under 100MB
    for table_name in pbar(
        (
            "by-age",
            "by-sex",
            "demographics",
            "economy",
            "epidemiology",
            "geography",
            "health",
            "hospitalizations",
            "index",
            "mobility",
            "oxford-government-response",
            "weather",
            "worldbank",
            "worldpop",
        ),
        desc="V2 download",
    ):
        for ext in ("csv", "json"):
            with tempfile.NamedTemporaryFile() as tmp:
                tmp_path = Path(tmp.name)
                download(f"{URL_OUTPUTS_PROD}/{table_name}.{ext}", tmp)
                # Check that the output is less than 100 MB before copying it to the output folder
                if tmp_path.stat().st_size < 100 * 1000 * 1000:
                    shutil.copyfile(tmp_path, v2_folder / f"{table_name}.{ext}")
def parse(self, sources: Dict[str, str], aux: Dict[str, DataFrame], **parse_opts) -> DataFrame:
    # Use a manager to handle memory accessed across processes
    manager = Manager()

    # Get all the weather stations with data up until last month from inventory
    today = datetime.date.today()
    min_date = (today - datetime.timedelta(days=30)).strftime("%Y%m%d")
    stations = read_file(sources["inventory"]).rename(
        columns={"LAT": "lat", "LON": "lon", "ELEV(M)": "elevation"}
    )
    stations = stations[stations.END > int(min_date)]
    stations["id"] = stations["USAF"] + stations["WBAN"].apply(lambda x: f"{x:05d}")

    # Open the station data as a compressed file
    with tarfile.open(sources["gsod"], mode="r:gz") as stations_tar:

        # Build the station cache by decompressing all files in memory
        station_cache = manager.dict()
        for member in pbar(stations_tar.getmembers(), desc="Decompressing"):

            if not member.name.endswith(".csv"):
                continue

            # Read the records from the provided station
            data = read_file(
                stations_tar.extractfile(member),
                file_type="csv",
                usecols=_COLUMN_MAPPING.keys(),
            ).rename(columns=_COLUMN_MAPPING)

            # Fix data types
            data["noaa_station"] = data["noaa_station"].astype(str)
            data["rainfall"] = data["rainfall"].apply(conv_dist)
            data["snowfall"] = data["snowfall"].apply(conv_dist)
            data["dew_point"] = data["dew_point"].apply(conv_temp)
            for temp_type in ("average", "minimum", "maximum"):
                col = f"{temp_type}_temperature"
                data[col] = data[col].apply(conv_temp)

            # Compute the relative humidity from the dew point and average temperature
            data["relative_humidity"] = data.apply(
                lambda x: relative_humidity(x["average_temperature"], x["dew_point"]), axis=1
            )

            station_cache[member.name.replace(".csv", "")] = data

    # Get all the POI from metadata and go through each key
    keep_columns = ["key", "latitude", "longitude"]
    metadata = read_file(sources["geography"])[keep_columns].dropna()

    # Only use keys present in the metadata table
    metadata = metadata.merge(aux["metadata"])[keep_columns]

    # Convert all coordinates to radians
    stations["lat"] = stations["lat"].apply(math.radians)
    stations["lon"] = stations["lon"].apply(math.radians)
    metadata["lat"] = metadata["latitude"].apply(math.radians)
    metadata["lon"] = metadata["longitude"].apply(math.radians)

    # Make sure the stations and the cache are sent to each function call
    map_func = partial(_process_location, station_cache, stations)

    # We don't care about the index while iterating over each metadata item
    map_iter = (record for _, record in metadata.iterrows())

    # Bottleneck is network so we can use lots of threads in parallel
    records = process_map(map_func, map_iter, total=len(metadata))

    return concat(records)
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting main.csv file.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = [date.date().isoformat() for date in date_range("2020-01-01", max_date)]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to the output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")
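# Illustrative usage sketch (not part of the original source): building the combined main table
# from a folder of published CSV files. The folder locations are assumptions for the example.
from pathlib import Path


def example_make_main_table() -> None:
    tables_folder = Path("output") / "tables"  # assumed folder holding the per-table CSV files
    output_path = Path("output") / "main.csv"  # assumed destination of the combined table
    make_main_table(tables_folder, output_path)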