def test_table_grouped_tail_real_data(self):
    """Compare table_grouped_tail with a pandas reference on each test CSV.

    For every ``*.csv`` under ``SRC / "test" / "data"`` the table is reduced
    twice: once by ``table_grouped_tail`` grouping on ``"key"``, and once by a
    pandas ``groupby("key")`` aggregated with ``agg_last_not_null``. The two
    exported files must contain the same set of lines.
    """
    with temporary_directory() as workdir:
        data_folder = SRC / "test" / "data"
        for csv_path in data_folder.glob("*.csv"):
            frame = read_table(csv_path, schema=SCHEMA)
            tail_output = workdir / f"latest_{csv_path.name}"
            reference_output = workdir / f"latest_pandas_{csv_path.name}"

            # Latest slice produced by the function under test
            table_grouped_tail(csv_path, tail_output, ["key"])

            # Reference latest slice produced through pandas grouping
            grouped = frame.groupby("key").aggregate(agg_last_not_null)
            frame = grouped.reset_index()
            export_csv(frame, path=reference_output, schema=SCHEMA)

            # Converting to a CSV in memory sometimes produces out-of-order
            # values, so both outputs are compared as sorted lists of lines
            with open_file_like(tail_output) as actual_fd:
                with open_file_like(reference_output) as reference_fd:
                    actual_lines = sorted(actual_fd)
                    reference_lines = sorted(reference_fd)

            self.assertEqual(len(actual_lines), len(reference_lines))
            for actual, reference in zip(actual_lines, reference_lines):
                self.assertEqual(actual, reference)
def test_table_grouped_tail_synthetic(self):
    """Run table_grouped_tail on a small inline table grouped by ``col1``.

    The result is written to a temporary file and checked against a
    hand-written expected table with ``_compare_tables_equal``.
    """
    # NOTE(review): both literals sit on a single line in the reviewed
    # source and are reproduced verbatim; presumably one row per line was
    # intended -- confirm against what _make_test_csv_file expects.
    input_rows = """ col1,col2,col3 a,1,foo a,2,bar b,1,foo b,2,baz c,1,foo c,2, """
    expected_rows = """ col1,col2,col3 a,2,bar b,2,baz c,2,foo """

    source_csv = _make_test_csv_file(input_rows)
    expected_csv = _make_test_csv_file(expected_rows)

    with temporary_file() as result_path:
        table_grouped_tail(source_csv, output_path=result_path, group_by=["col1"])
        _compare_tables_equal(self, result_path, expected_csv)
def _subset_latest(output_folder: Path, csv_file: Path) -> Path:
    """Write the "latest" version of a table into ``output_folder``.

    Args:
        output_folder: Folder that receives the output; the file keeps the
            name of ``csv_file``.
        csv_file: Path of the input CSV table.

    Returns:
        Path of the file written inside ``output_folder``.
    """
    destination = output_folder / csv_file.name

    if "date" in get_table_columns(csv_file):
        table_grouped_tail(csv_file, destination, ["key"])
    else:
        # Degenerate case: without a "date" column the table is copied as-is
        shutil.copyfile(csv_file, destination)

    return destination