def _make_location_key_and_date_table(index_table: Path, output_path: Path) -> None:
    """Write to <output_path> the cross product of every location key with every date."""
    # All intermediate files go into a scratch directory that is discarded on exit
    with temporary_directory() as workdir:

        # Without the index table there is nothing to enumerate
        assert index_table.exists(), "Index table not found"

        # The index schema determines whether the key column is named "key" or "location_key"
        columns = get_table_columns(index_table)
        key_column = "location_key" if "location_key" in columns else "key"

        # Dump a single-column CSV containing just the location keys
        keys_csv = workdir / "location_keys.csv"
        with open(keys_csv, "w") as fd:
            fd.write(f"{key_column}\n")
            fd.writelines(f"{value}\n" for value in table_read_column(index_table, key_column))

        # Dump a single-column CSV of dates, spanning 2020-01-01 through tomorrow,
        # so each region can later be joined against every date
        last_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        dates_csv = workdir / "dates.csv"
        with open(dates_csv, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n" for value in date_range("2020-01-01", last_date))

        # The final table is every <key x date> combination
        table_cross_product(keys_csv, dates_csv, output_path)
def test_cross_product(self):
    """table_cross_product must emit every row pairing of the two input tables."""
    csv1 = """col1,col2
a,1
b,2
c,3
d,4
"""
    csv2 = """col3,col4
1,a
2,b
3,c
4,d
"""
    expected = """col1,col2,col3,col4
a,1,1,a
a,1,2,b
a,1,3,c
a,1,4,d
b,2,1,a
b,2,2,b
b,2,3,c
b,2,4,d
c,3,1,a
c,3,2,b
c,3,3,c
c,3,4,d
d,4,1,a
d,4,2,b
d,4,3,c
d,4,4,d
"""
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Write the fixture tables to disk. NOTE: the filter must be `line.strip()`:
        # the previous check `not line.isspace()` let empty strings through, because
        # "".isspace() is False, so a stray blank line was appended to each CSV.
        with open(workdir / "1.csv", "w") as fd:
            for line in csv1.split("\n"):
                if line.strip():
                    fd.write(f"{line.strip()}\n")
        with open(workdir / "2.csv", "w") as fd:
            for line in csv2.split("\n"):
                if line.strip():
                    fd.write(f"{line.strip()}\n")

        output_file = workdir / "out.csv"
        table_cross_product(workdir / "1.csv", workdir / "2.csv", output_file)

        # Compare the produced table line-by-line against the expected output
        for line1, line2 in zip(expected.split("\n"), read_lines(output_file)):
            self.assertEqual(line1.strip(), line2.strip())
def test_cross_product(self):
    """The cross product of two 4-row tables should contain all 16 row pairings."""
    # Build the two input fixtures and the expected result as on-disk CSV files
    left_table = _make_test_csv_file(
        """
        col1,col2
        a,1
        b,2
        c,3
        d,4
        """
    )
    right_table = _make_test_csv_file(
        """
        col3,col4
        1,a
        2,b
        3,c
        4,d
        """
    )
    expected_table = _make_test_csv_file(
        """
        col1,col2,col3,col4
        a,1,1,a
        a,1,2,b
        a,1,3,c
        a,1,4,d
        b,2,1,a
        b,2,2,b
        b,2,3,c
        b,2,4,d
        c,3,1,a
        c,3,2,b
        c,3,3,c
        c,3,4,d
        d,4,1,a
        d,4,2,b
        d,4,3,c
        d,4,4,d
        """
    )

    # Run the cross product and verify the output matches the expected table
    with temporary_file() as output_file:
        table_cross_product(left_table, right_table, output_file)
        _compare_tables_equal(self, output_file, expected_table)
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>,
    and write the result to the given output location.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Location where the combined CSV table is written.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins;
        # the range spans 2020-01-01 through tomorrow
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = [date.date().isoformat() for date in date_range("2020-01-01", max_date)]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path)
        print("Joined with table index")

        # Collected for the null-row cleanup described in the TODO below; currently unused
        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")