def test_table_grouped_tail_real_data(self):
    with temporary_directory() as workdir:

        for table_path in (SRC / "test" / "data").glob("*.csv"):
            table = read_table(table_path, schema=SCHEMA)
            test_output = workdir / f"latest_{table_path.name}"
            pandas_output = workdir / f"latest_pandas_{table_path.name}"

            # Create the latest slice of the given table
            table_grouped_tail(table_path, test_output, ["key"])

            # Create a latest slice using pandas grouping
            table = table.groupby("key").aggregate(agg_last_not_null).reset_index()
            export_csv(table, path=pandas_output, schema=SCHEMA)

            # Converting to a CSV in memory sometimes produces out-of-order values
            with open_file_like(test_output) as fd1, open_file_like(pandas_output) as fd2:
                test_result_lines = list(sorted(fd1))
                pandas_result_lines = list(sorted(fd2))

            self.assertEqual(len(test_result_lines), len(pandas_result_lines))

            for line1, line2 in zip(test_result_lines, pandas_result_lines):
                self.assertEqual(line1, line2)
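# Both this test and test_table_group_tail further below emulate tail-grouping in
# pandas via `agg_last_not_null`. A minimal sketch of such an aggregator, assuming
# it only needs to keep the last non-null value within each group (the repository's
# actual helper may differ):
from pandas import Series


def agg_last_not_null_sketch(series: Series):
    """Return the last non-null value of `series`, or None if every value is null."""
    non_null = series.dropna()
    return non_null.iloc[-1] if len(non_null) > 0 else None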
def fetch(
    self,
    output_folder: Path,
    cache: Dict[str, str],
    fetch_opts: List[Dict[str, Any]],
    skip_existing: bool = False,
) -> Dict[str, str]:
    # The URL is just a template which we'll use to download each state
    base_opts = dict(fetch_opts[0])
    url_tpl = base_opts.pop("url")

    # Some states cannot be found in the dataset
    states_banlist = ["AS", "GU", "MP", "PR", "VI"]
    states = read_table(SRC / "data" / "metadata.csv")
    states = states.loc[states["country_code"] == "US", "subregion1_code"].dropna().unique()
    states = [state for state in states if state not in states_banlist]

    opts = [dict(**base_opts, name=code, url=url_tpl.format(state=code)) for code in states]
    return super().fetch(
        output_folder=output_folder, cache=cache, fetch_opts=opts, skip_existing=skip_existing
    )
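# For illustration, the per-state option expansion above relies only on standard
# `str.format`; the template URL shown here is hypothetical, not the data source's
# actual endpoint:
#
# >>> url_tpl = "https://example.com/covid/{state}.csv"
# >>> url_tpl.format(state="CA")
# 'https://example.com/covid/CA.csv'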
def load_combined_table(pipeline: DataPipeline, prod_folder: str) -> DataFrame:
    table_name = pipeline.table
    with temporary_directory() as workdir:
        output_path = workdir / f"{table_name}.csv"
        download_file(GCS_BUCKET_PROD, f"{prod_folder}/{table_name}.csv", output_path)
        combined_table = read_table(output_path)

    index_columns = (["date"] if "date" in combined_table.columns else []) + ["location_key"]
    return combined_table.set_index(index_columns)
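# Usage sketch: with ["date", "location_key"] as the index, a single record can be
# looked up with a tuple. The pipeline object and folder name here are hypothetical
# placeholders:
#
# >>> combined = load_combined_table(epidemiology_pipeline, "v3")
# >>> combined.loc[("2020-12-31", "US_CA")]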
def test_table_sort(self):
    test_csv = """col1,col2,col3
        a,1,foo
        d,4,bar
        c,3,foo
        b,2,bar
        """

    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Write the test data into a file, stripping indentation and blank lines
        input_file = workdir / "in.csv"
        with open(input_file, "w") as fd:
            for line in test_csv.split("\n"):
                if line.strip():
                    fd.write(f"{line.strip()}\n")

        # Sort using the default (first) column
        output_file_1 = workdir / "out.csv"
        table_sort(input_file, output_file_1)

        output_file_2 = workdir / "pandas.csv"
        read_table(input_file).sort_values(["col1"]).to_csv(output_file_2, index=False)

        for line1, line2 in zip(read_lines(output_file_1), read_lines(output_file_2)):
            self.assertEqual(line1.strip(), line2.strip())

        # Sort by each column in order
        for sort_column in ("col1", "col2", "col3"):
            output_file_1 = workdir / "out.csv"
            table_sort(input_file, output_file_1, [sort_column])

            output_file_2 = workdir / "pandas.csv"
            read_table(input_file).sort_values([sort_column]).to_csv(output_file_2, index=False)

            for line1, line2 in zip(read_lines(output_file_1), read_lines(output_file_2)):
                self.assertEqual(line1.strip(), line2.strip())
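# A minimal sketch of the `read_lines` helper used for output comparison, assuming
# it lazily yields the lines of a text file (the repository's version may also
# support compressed or file-like inputs):
from pathlib import Path
from typing import Iterator


def read_lines_sketch(path: Path) -> Iterator[str]:
    with open(path, "r") as fd:
        yield from fd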
def _test_make_main_table_helper(self, main_table_path: Path, column_adapter: Dict[str, str]):
    main_table = read_table(main_table_path, schema=SCHEMA)

    # Verify that all columns from all tables exist
    for pipeline in get_pipelines():
        for column_name in pipeline.schema.keys():
            column_name = column_adapter.get(column_name)
            if column_name is not None:
                self.assertTrue(
                    column_name in main_table.columns,
                    f"Column {column_name} missing from main table",
                )

    # Main table should follow a lexical sort (outside of header)
    main_table_records = []
    for line in read_lines(main_table_path):
        main_table_records.append(line)
    main_table_records = main_table_records[1:]
    self.assertListEqual(main_table_records, list(sorted(main_table_records)))

    # Make sure that all columns present in the index table are in the main table
    main_table_columns = set(get_table_columns(main_table_path))
    index_table_columns = set(get_table_columns(SRC / "test" / "data" / "index.csv"))
    for column in index_table_columns:
        column = column_adapter.get(column, column)
        self.assertTrue(column in main_table_columns, f"{column} not in main")

    # Make the main table easier to deal with since we optimize for memory usage
    location_key = "location_key" if "location_key" in main_table.columns else "key"
    main_table.set_index(location_key, inplace=True)
    main_table["date"] = main_table["date"].astype(str)

    # Define sets of columns to check
    column_prefixes = ("new", "total", "cumulative")
    column_filter = lambda col: col.split("_")[0] in column_prefixes and "age" not in col
    columns = list(filter(column_filter, main_table.columns))
    self.assertGreaterEqual(len({col.split("_")[0] for col in columns}), 2)
    main_table = main_table[["date"] + columns]

    # Spot check: Country of Andorra
    self._spot_check_subset(main_table, "AD", "2020-09-01", "2020-12-31")

    # Spot check: State of New South Wales
    self._spot_check_subset(main_table, "AU_NSW", "2020-09-01", "2020-12-31")

    # Spot check: Alachua County
    self._spot_check_subset(main_table, "US_FL_12001", "2020-09-01", "2020-12-31")
def test_make_main_table(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Copy all test tables into the temporary directory
        copy_tables(SRC / "test" / "data", workdir)

        # Create the main table
        main_table_path = workdir / "main.csv"
        make_main_table(workdir, main_table_path)
        main_table = read_table(main_table_path, schema=SCHEMA)

        # Verify that all columns from all tables exist
        for pipeline in get_pipelines():
            if pipeline.table in EXCLUDE_FROM_MAIN_TABLE:
                continue
            for column_name in pipeline.schema.keys():
                self.assertTrue(
                    column_name in main_table.columns,
                    f"Column {column_name} missing from main table",
                )

        # Main table should follow a lexical sort (outside of header)
        main_table_records = []
        for line in read_lines(main_table_path):
            main_table_records.append(line)
        main_table_records = main_table_records[1:]
        self.assertListEqual(main_table_records, list(sorted(main_table_records)))

        # Make the main table easier to deal with since we optimize for memory usage
        main_table.set_index("key", inplace=True)
        main_table["date"] = main_table["date"].astype(str)

        # Define sets of columns to check
        epi_basic = ["new_confirmed", "total_confirmed", "new_deceased", "total_deceased"]

        # Spot check: Country of Andorra
        self._spot_check_subset(main_table, "AD", epi_basic, "2020-03-02", "2020-09-01")

        # Spot check: State of New South Wales
        self._spot_check_subset(main_table, "AU_NSW", epi_basic, "2020-01-25", "2020-09-01")

        # Spot check: Alachua County
        self._spot_check_subset(main_table, "US_FL_12001", epi_basic, "2020-03-10", "2020-09-01")
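# A sketch of the `_spot_check_subset` helper invoked above, matching the
# five-argument form used in this test (the variant in _test_make_main_table_helper
# omits the column list). It is assumed to verify that each given column carries at
# least one non-null value for the key within the date range; the assertions in the
# actual test suite may be stricter:
def _spot_check_subset_sketch(self, table, key, columns, first_date, last_date):
    subset = table.loc[key]
    subset = subset[(subset["date"] >= first_date) & (subset["date"] <= last_date)]
    self.assertGreater(len(subset), 0, f"no records found for {key}")
    for column in columns:
        self.assertTrue(subset[column].notna().any(), f"{column} is all null for {key}")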
def test_table_sort(self):
    test_csv = _make_test_csv_file(
        """
        col1,col2,col3
        a,1,foo
        d,4,bar
        c,3,foo
        b,2,bar
        """
    )

    with temporary_directory() as workdir:

        # Sort using the default (first) column
        output_file_1 = workdir / "out.csv"
        table_sort(test_csv, output_file_1)

        test_csv.seek(0)
        output_file_2 = workdir / "pandas.csv"
        read_table(test_csv, file_type="csv").sort_values(["col1"]).to_csv(
            output_file_2, index=False
        )

        _compare_tables_equal(self, output_file_1, output_file_2)

        # Sort by each column in order
        for sort_column in ("col1", "col2", "col3"):
            # Rewind the input since the previous read consumed the file handle
            test_csv.seek(0)
            output_file_1 = workdir / f"1.{sort_column}.csv"
            table_sort(test_csv, output_file_1, [sort_column])

            test_csv.seek(0)
            output_file_2 = workdir / f"2.{sort_column}.csv"
            read_table(test_csv, file_type="csv").sort_values([sort_column]).to_csv(
                output_file_2, index=False
            )

            _compare_tables_equal(self, output_file_1, output_file_2)
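# Sketches of the two test helpers assumed by the version of test_table_sort above;
# the repository's real helpers may differ. `_make_test_csv_file` strips the
# indentation from an inline CSV literal and returns a seekable file-like object,
# and `_compare_tables_equal` asserts that two CSV outputs match line by line:
from io import StringIO


def _make_test_csv_file(csv_text: str) -> StringIO:
    lines = [line.strip() for line in csv_text.split("\n") if line.strip()]
    return StringIO("\n".join(lines) + "\n")


def _compare_tables_equal(test_case, path1, path2) -> None:
    with open(path1) as fd1, open(path2) as fd2:
        lines1 = [line.strip() for line in fd1]
        lines2 = [line.strip() for line in fd2]
    test_case.assertEqual(lines1, lines2)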
def load_intermediate_tables(
    pipeline: DataPipeline, column_adapter: Dict[str, str], index_columns: List[str]
) -> Iterable[Tuple[DataSource, DataFrame]]:
    with temporary_directory() as workdir:
        for data_source in tqdm(pipeline.data_sources, desc="Downloading intermediate tables"):
            fname = data_source.uuid(pipeline.table) + ".csv"
            try:
                download_file(GCS_BUCKET_TEST, f"intermediate/{fname}", workdir / fname)
                table = read_table(workdir / fname).rename(columns=column_adapter)
                table = table.groupby(index_columns).last()
                yield (data_source, table)
            except Exception as exc:
                print(f"intermediate table not found: {fname} ({exc})", file=sys.stderr)
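# Usage sketch for the generator above; the pipeline object and column adapter are
# hypothetical placeholders:
#
# >>> adapter = {"key": "location_key"}
# >>> for source, table in load_intermediate_tables(pipeline, adapter, ["location_key"]):
# ...     print(source.uuid(pipeline.table), len(table))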
def test_table_group_tail(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        for table_path in (SRC / "test" / "data").glob("*.csv"):
            table = read_table(table_path, schema=SCHEMA)
            test_output_path = workdir / f"latest_{table_path.name}"
            pandas_output_path = workdir / f"latest_pandas_{table_path.name}"

            # Create the latest slice of the given table
            table_group_tail(table_path, test_output_path)

            # Create a latest slice using pandas grouping
            table = table.groupby("key").aggregate(agg_last_not_null).reset_index()
            export_csv(table, path=pandas_output_path, schema=SCHEMA)

            # Converting to a CSV in memory sometimes produces out-of-order values
            test_result_lines = sorted(read_lines(test_output_path))
            pandas_result_lines = sorted(read_lines(pandas_output_path))

            # Compare lengths first, since zip() would silently truncate the longer output
            self.assertEqual(len(test_result_lines), len(pandas_result_lines))

            for line1, line2 in zip(test_result_lines, pandas_result_lines):
                self.assertEqual(line1, line2)