def _test_join_all(self, how_mem: str, how_pandas: str):
    # Create a custom function used to read tables casting to the expected schema
    read_table_ = partial(read_table, schema=SCHEMA, low_memory=False)

    # Test joining the index table with every other table
    left = SRC / "test" / "data" / "index.csv"
    for right in pbar([*(SRC / "test" / "data").glob("*.csv")], leave=False):
        if left.name == right.name:
            continue

        left_columns = get_table_columns(left)
        right_columns = get_table_columns(right)

        if "date" not in right_columns:
            self._test_join_pair(read_table_, SCHEMA, left, right, ["key"], how_mem, how_pandas)

        if "date" in left_columns and "date" not in right_columns:
            self._test_join_pair(read_table_, SCHEMA, left, right, ["key"], how_mem, how_pandas)

        if "date" in left_columns and "date" in right_columns:
            self._test_join_pair(
                read_table_, SCHEMA, left, right, ["key", "date"], how_mem, how_pandas
            )
def _test_make_main_table_helper(self, main_table_path: Path, column_adapter: Dict[str, str]):
    main_table = read_table(main_table_path, schema=SCHEMA)

    # Verify that all columns from all tables exist
    for pipeline in get_pipelines():
        for column_name in pipeline.schema.keys():
            column_name = column_adapter.get(column_name)
            if column_name is not None:
                self.assertTrue(
                    column_name in main_table.columns,
                    f"Column {column_name} missing from main table",
                )

    # Main table should follow a lexical sort (outside of header)
    main_table_records = []
    for line in read_lines(main_table_path):
        main_table_records.append(line)
    main_table_records = main_table_records[1:]
    self.assertListEqual(main_table_records, list(sorted(main_table_records)))

    # Make sure that all columns present in the index table are in the main table
    main_table_columns = set(get_table_columns(main_table_path))
    index_table_columns = set(get_table_columns(SRC / "test" / "data" / "index.csv"))
    for column in index_table_columns:
        column = column_adapter.get(column, column)
        self.assertTrue(column in main_table_columns, f"{column} not in main")

    # Make the main table easier to deal with since we optimize for memory usage
    location_key = "location_key" if "location_key" in main_table.columns else "key"
    main_table.set_index(location_key, inplace=True)
    main_table["date"] = main_table["date"].astype(str)

    # Define sets of columns to check
    column_prefixes = ("new", "total", "cumulative")
    column_filter = lambda col: col.split("_")[0] in column_prefixes and "age" not in col
    columns = list(filter(column_filter, main_table.columns))
    self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
    main_table = main_table[["date"] + columns]

    # Spot check: Country of Andorra
    self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

    # Spot check: State of New South Wales
    self._spot_check_subset(main_table, "AU_NSW", "2020-09-01", "2020-12-31")

    # Spot check: Alachua County
    self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01", "2020-12-31")
def _make_location_key_and_date_table(index_table: Path, output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # Index table will determine if we use "key" or "location_key" as column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(f"{value}\n" for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n" for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
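# Illustrative sketch (not part of the original module): a minimal driver showing how the
# helper above might be invoked. The paths here are hypothetical; the only requirement is that
# the index table exists and has a "key" or "location_key" column. The output contains one row
# per <key, date> pair, e.g. <AD, 2020-01-01>, <AD, 2020-01-02>, and so on for every key.
def _example_make_location_key_and_date_table() -> None:
    index_table = Path("test/data/index.csv")  # hypothetical index table path
    output_path = Path("keys_and_dates.csv")  # destination for the <key x date> cross product
    _make_location_key_and_date_table(index_table, output_path)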
def _compare_tables_equal(self, table1: Path, table2: Path) -> None:
    cols1 = get_table_columns(table1)
    cols2 = get_table_columns(table2)
    self.assertEqual(set(cols1), set(cols2))

    # Converting to a CSV in memory sometimes produces out-of-order values
    records1 = list(read_lines(table1))
    records2 = list(read_lines(table2))
    self.assertEqual(len(records1), len(records2))

    reader1 = csv.reader(records1)
    reader2 = csv.reader(records2)
    for record1, record2 in zip(reader1, reader2):
        record1 = {col: val for col, val in zip(cols1, record1)}
        record2 = {col: val for col, val in zip(cols2, record2)}
        self.assertEqual(record1, record2)
def test_table_file_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)
        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"

        # Verify that all tables were imported
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, table_name=table_name)
                self._check_table_not_empty(conn, table_name)

                # Dirty hack used to compare appropriate column names. Ideally this would be
                # handled by the SQL module, which should convert the table and column names to
                # whatever they were prior to sanitizing them.
                temp_file_path_1 = workdir / f"{table_name}.1.csv"
                column_adapter = {
                    col: _safe_column_name(col).replace("[", "").replace("]", "")
                    for col in get_table_columns(table_path)
                }
                table_rename(table_path, temp_file_path_1, column_adapter)

                temp_file_path_2 = workdir / f"{table_name}.2.csv"
                table_export_csv(conn, table_name, temp_file_path_2)
                _compare_tables_equal(self, temp_file_path_1, temp_file_path_2)
def read_source_output(data_pipeline: DataPipeline, data_source: DataSource) -> Dict[str, str]:
    with temporary_directory() as workdir:
        output_path = workdir / f"{data_source.uuid(data_pipeline.table)}.csv"
        try:
            download_file(GCS_BUCKET_TEST, f"intermediate/{output_path.name}", output_path)
            columns = get_table_columns(output_path)
            dates = list(table_read_column(output_path, "date")) if "date" in columns else [None]
            return {
                "pipeline": data_pipeline.name,
                "data_source": f"{data_source.__module__}.{data_source.name}",
                "columns": ",".join(columns),
                "first_date": min(dates),
                "last_date": max(dates),
                "location_keys": ",".join(sorted(set(table_read_column(output_path, "key")))),
            }
        except Exception as exc:
            # Return an empty record on failure so callers can filter out missing sources
            print(exc, file=sys.stderr)
            return {}
def _compare_tables_equal(test_case: ProfiledTestCase, table1: Path, table2: Path) -> None:
    cols1 = get_table_columns(table1)
    cols2 = get_table_columns(table2)
    test_case.assertEqual(set(cols1), set(cols2))

    # Converting to a CSV in memory sometimes produces out-of-order values
    with open_file_like(table1) as fd1, open_file_like(table2) as fd2:
        records1 = list(line_reader(fd1, skip_empty=True))
        records2 = list(line_reader(fd2, skip_empty=True))
        test_case.assertEqual(len(records1), len(records2))

        reader1 = csv.reader(records1)
        reader2 = csv.reader(records2)
        for record1, record2 in zip(reader1, reader2):
            record1 = {col: val for col, val in zip(cols1, record1)}
            record2 = {col: val for col, val in zip(cols2, record2)}
            test_case.assertEqual(record1, record2)
def _grouped_subset_latest(
    output_folder: Path, csv_file: Path, group_column: str = "location_key"
) -> Path:
    # NOTE: renamed from _subset_latest and given a group_column parameter to match the call
    # made by publish_subset_latest below, which would otherwise fail with a NameError
    output_file = output_folder / csv_file.name

    # Degenerate case: table has no "date" column, so the latest subset is the table itself
    columns = get_table_columns(csv_file)
    if "date" not in columns:
        shutil.copyfile(csv_file, output_file)
    else:
        table_grouped_tail(csv_file, output_file, [group_column])

    return output_file
def _latest_date_by_group(tables_folder: Path, group_by: str = "location_key") -> Dict[str, str]:
    groups: Dict[str, str] = {}
    for table_file in tables_folder.glob("*.csv"):
        table_columns = get_table_columns(table_file)
        if "date" in table_columns:
            iter1 = table_read_column(table_file, "date")
            iter2 = table_read_column(table_file, group_by)
            for date, key in zip(iter1, iter2):
                groups[key] = max(groups.get(key, date), date)
    return groups
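# Minimal sketch of the reduction performed by _latest_date_by_group, on in-memory toy data
# rather than CSV files. It relies on the same assumption as the function above: dates are
# ISO-8601 strings, so the lexicographic max() is also the chronological maximum.
def _example_latest_date_reduction() -> None:
    rows = [("2020-05-01", "AD"), ("2020-06-01", "AD"), ("2020-04-15", "AU_NSW")]
    groups: Dict[str, str] = {}
    for date, key in rows:
        # Seed each key with the first date seen, then keep the running maximum
        groups[key] = max(groups.get(key, date), date)
    assert groups == {"AD": "2020-06-01", "AU_NSW": "2020-04-15"}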
def publish_subset_latest(
    tables_folder: Path, output_folder: Path, key: str = "location_key", **tqdm_kwargs
) -> Iterable[Path]:
    """
    This method outputs the latest record by date per location key for each of the input tables.

    Arguments:
        tables_folder: Directory containing input CSV files.
        output_folder: Output path for the resulting data.
        key: Column name to group by.
    """
    agg_table_name = "aggregated"

    # Create a latest subset version for each of the tables in parallel
    map_iter = [table for table in tables_folder.glob("*.csv") if table.stem != agg_table_name]
    _logger.log_info(f"Computing latest subset for {len(map_iter)} tables")
    map_opts = dict(total=len(map_iter), desc="Creating latest subsets", **tqdm_kwargs)
    map_func = partial(_grouped_subset_latest, output_folder, group_column=key)
    for table in pbar(map(map_func, map_iter), **map_opts):
        yield table

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        latest_dates_table = workdir / "dates.csv"
        latest_dates_map = _latest_date_by_group(output_folder, group_by=key)
        with open(latest_dates_table, "w") as fh:
            fh.write("location_key,date\n")
            for location_key, date in latest_dates_map.items():
                fh.write(f"{location_key},{date}\n")

        join_table_paths = [latest_dates_table]
        tables_in = (table for table in output_folder.glob("*.csv") if table.stem in V3_TABLE_LIST)
        for table_file in tables_in:
            table_columns = get_table_columns(table_file)
            if "date" not in table_columns:
                join_table_paths.append(table_file)
            else:
                tmp_file = workdir / table_file.name
                table_rename(table_file, tmp_file, {"date": None})
                join_table_paths.append(tmp_file)

        # Join them all into a single file for the aggregate version
        output_agg = output_folder / f"{agg_table_name}.csv"
        table_merge(join_table_paths, output_agg, on=[key], how="OUTER")
        yield output_agg
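# Illustrative driver (not from the original source): publish_subset_latest is a generator, so
# it performs no work until consumed. The paths are hypothetical, and disable=True assumes pbar
# forwards standard tqdm keywords via **tqdm_kwargs to silence the progress bar.
def _example_publish_subset_latest() -> None:
    tables_folder = Path("v3/tables")  # hypothetical folder of per-table CSV files
    output_folder = Path("v3/latest")  # hypothetical destination for the latest subsets
    output_folder.mkdir(parents=True, exist_ok=True)
    for table in publish_subset_latest(tables_folder, output_folder, disable=True):
        print(f"Wrote latest subset: {table}")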
def merge_output_tables(
    tables_folder: Path,
    output_path: Path,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting CSV file.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Tables which should be included in the combined output.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use temporary files to avoid computing everything in memory
        temp_input = workdir / "tmp.1.csv"
        temp_output = workdir / "tmp.2.csv"

        # Start with all combinations of <location key x date>
        _make_location_key_and_date_table(tables_folder / "index.csv", temp_output)
        temp_input, temp_output = temp_output, temp_input

        for table_file_path in table_paths:
            # Join by <location key> or <location key x date> depending on what's available
            table_columns = get_table_columns(table_file_path)
            join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

            # Iteratively perform left outer joins on all tables
            table_join(temp_input, table_file_path, join_on, temp_output, how="outer")

            # Flip-flop the temp files to avoid a copy
            temp_input, temp_output = temp_output, temp_input

        # TODO: figure out a memory-efficient way to drop rows with null date or without a
        # single dated record

        # Remove columns which provide no data because they are only null values
        if drop_empty_columns:
            table_drop_nan_columns(temp_input, temp_output)
            temp_input, temp_output = temp_output, temp_input

        # Ensure that the table is appropriately sorted and write to output location
        table_sort(temp_input, output_path)
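# Illustrative usage (paths are hypothetical): merging a folder of V2 tables into a single
# main.csv, dropping columns that contain only null values.
def _example_merge_output_tables() -> None:
    tables_folder = Path("v2/tables")  # hypothetical input folder, must contain index.csv
    output_path = Path("v2/main.csv")  # hypothetical output file
    merge_output_tables(tables_folder, output_path, drop_empty_columns=True)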
def import_tables_into_sqlite(table_paths: List[Path], output_path: Path) -> None:
    """
    Import a list of CSV tables into an SQLite database on disk.

    Arguments:
        table_paths: List of CSV files to import as individual tables.
        output_path: Output path for the resulting SQLite file.
    """
    # Import all tables into a database on disk at the provided path
    with create_sqlite_database(output_path) as conn:
        # Get a list of all tables indexed by <location_key> or by <location_key, date>
        schema = get_schema()
        for table_file_path in table_paths:
            table_name = table_file_path.stem
            _logger.log_info(f"Importing {table_name} into SQLite")
            table_columns = get_table_columns(table_file_path)
            table_schema = {col: schema.get(col, str) for col in table_columns}
            table_import_from_file(conn, table_file_path, table_name=table_name, schema=table_schema)
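# Illustrative sketch: once imported, the tables can be inspected with the standard sqlite3
# module. Table names mirror the CSV file stems; the "epidemiology" table and the paths used
# here are hypothetical.
def _example_query_imported_tables() -> None:
    import sqlite3

    db_path = Path("database.sqlite")  # hypothetical output database
    import_tables_into_sqlite(sorted(Path("tables").glob("*.csv")), db_path)
    with sqlite3.connect(str(db_path)) as conn:
        # Count the records imported from a hypothetical epidemiology.csv input
        count = conn.execute("SELECT COUNT(*) FROM epidemiology").fetchone()[0]
        print(f"epidemiology has {count} records")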
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
        output_path: Output path for the resulting main table CSV file.
    """
    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Merge all output files into a single table
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() + datetime.timedelta(days=1)).date().isoformat()
        date_list = [date.date().isoformat() for date in date_range("2020-01-01", max_date)]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"], main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")], desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on, temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # TODO: figure out a memory-efficient way to drop rows with null date or without a
        # single dated record

        # Ensure that the table is appropriately sorted and written to the output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")
def merge_output_tables_sqlite(
    tables_folder: Path,
    output_path: Path,
    sqlite_file: Path = None,
    drop_empty_columns: bool = False,
    use_table_names: List[str] = None,
) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>. This function
    requires index.csv to be present under `tables_folder`.

    Arguments:
        tables_folder: Input directory where all CSV files exist.
        output_path: Output path for the resulting CSV file.
        sqlite_file: Path for the SQLite database to use for importing data, defaults to a
            temporary database on disk.
        drop_empty_columns: Flag determining whether columns with null values only should be
            removed from the output.
        use_table_names: Tables which should be included in the combined output.
    """
    # Default to a known list of tables to use when none is given
    table_paths = _get_tables_in_folder(tables_folder, use_table_names or V2_TABLE_LIST)

    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Use two temporary tables as I/O for intermediate operations
        temp_table_input, temp_table_output = "tmp_table_name_1", "tmp_table_name_2"

        # Start with all combinations of <location key x date>
        keys_and_dates_table_path = workdir / f"{temp_table_input}.csv"
        _logger.log_info("Creating keys and dates table")
        index_table = [table for table in table_paths if table.stem == "index"][0]
        _make_location_key_and_date_table(index_table, keys_and_dates_table_path)

        # Create an SQLite database
        _logger.log_info("Importing all tables into SQLite")
        database_file = sqlite_file or workdir / "database.sqlite"
        import_tables_into_sqlite([keys_and_dates_table_path] + table_paths, database_file)

        with create_sqlite_database(database_file) as conn:
            _logger.log_info("Merging all tables into a flat output")
            for table in table_paths:
                _logger.log_info(f"Merging {table.stem}")

                # Read the table's header to determine how to merge it
                table_name = _safe_table_name(table.stem)
                table_columns = get_table_columns(table)
                join_on = [col for col in ("key", "location_key", "date") if col in table_columns]

                # Join with the current intermediate table
                sql_table_join(
                    conn,
                    left=temp_table_input,
                    right=table_name,
                    on=join_on,
                    how="left outer",
                    into_table=temp_table_output,
                )

                # Flip-flop the I/O tables to avoid a copy
                temp_table_input, temp_table_output = temp_table_output, temp_table_input

            sort_values = ("location_key", "date")
            _logger.log_info("Exporting output as CSV")
            sql_export_csv(conn, temp_table_input, output_path=output_path, sort_by=sort_values)

            # Remove the intermediate tables from the SQLite database
            sql_table_drop(conn, temp_table_input)
            sql_table_drop(conn, temp_table_output)
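# Illustrative driver (paths are hypothetical): the SQLite-backed merge trades disk space for
# memory, so it suits machines that cannot hold the fully joined table in RAM. Passing an
# explicit sqlite_file keeps the intermediate database around for debugging.
def _example_merge_output_tables_sqlite() -> None:
    merge_output_tables_sqlite(
        tables_folder=Path("v2/tables"),  # hypothetical folder, must contain index.csv
        output_path=Path("v2/main.csv"),  # hypothetical output file
        sqlite_file=Path("v2/database.sqlite"),  # optional; defaults to a temporary file
    )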