Exemplo n.º 1
0
    def test_table_grouped_tail_real_data(self):
        """Compare table_grouped_tail with a pandas groupby on every test CSV.

        Each ``*.csv`` file under ``SRC / "test" / "data"`` is processed twice:
        once with ``table_grouped_tail`` grouped by ``key``, and once by
        grouping the loaded table on ``key`` with pandas and aggregating with
        ``agg_last_not_null``. Both outputs are written into a temporary
        directory and must contain the same lines once sorted.
        """
        with temporary_directory() as workdir:
            data_folder = SRC / "test" / "data"

            for table_path in data_folder.glob("*.csv"):
                source_table = read_table(table_path, schema=SCHEMA)
                actual_path = workdir / f"latest_{table_path.name}"
                reference_path = workdir / f"latest_pandas_{table_path.name}"

                # Output under test: latest slice produced by the function
                table_grouped_tail(table_path, actual_path, ["key"])

                # Reference output: the same slice computed via pandas grouping
                grouped = source_table.groupby("key")
                reference_table = grouped.aggregate(agg_last_not_null)
                reference_table = reference_table.reset_index()
                export_csv(reference_table, path=reference_path, schema=SCHEMA)

                # Row order may differ between the two files, so the lines are
                # sorted before being compared
                with open_file_like(actual_path) as actual_fd:
                    with open_file_like(reference_path) as reference_fd:
                        actual_lines = sorted(actual_fd)
                        reference_lines = sorted(reference_fd)

                # zip() stops at the shorter input, so check the lengths first
                self.assertEqual(len(actual_lines), len(reference_lines))
                for actual, reference in zip(actual_lines, reference_lines):
                    self.assertEqual(actual, reference)
Exemplo n.º 2
0
    def test_table_grouped_tail_synthetic(self):
        """Check table_grouped_tail on a small hand-written table.

        The input has two rows for each ``col1`` value. The expected output
        has one row per ``col1`` value; for group ``c`` the final row has an
        empty ``col3``, and the expected row carries ``foo`` from the earlier
        row instead.
        """
        # Input table: two rows per group, last row of group "c" lacks col3
        source_csv = _make_test_csv_file(
            """
            col1,col2,col3
            a,1,foo
            a,2,bar
            b,1,foo
            b,2,baz
            c,1,foo
            c,2,
            """
        )

        # Expected table: a single row per group
        expected_csv = _make_test_csv_file(
            """
            col1,col2,col3
            a,2,bar
            b,2,baz
            c,2,foo
            """
        )

        with temporary_file() as output_file:
            table_grouped_tail(source_csv, output_file, ["col1"])
            _compare_tables_equal(self, output_file, expected_csv)
Exemplo n.º 3
0
def _subset_latest(output_folder: Path, csv_file: Path) -> Path:
    """Write the "latest" version of a CSV table into ``output_folder``.

    The output file has the same name as ``csv_file``. A table without a
    ``date`` column is copied unchanged; any other table is passed through
    ``table_grouped_tail`` grouped by ``key``.

    Args:
        output_folder: Directory in which the output file is created.
        csv_file: Path of the input CSV table.

    Returns:
        Path of the file written inside ``output_folder``.
    """
    destination = output_folder / csv_file.name

    # Degenerate case: with no "date" column the table is copied as-is
    if "date" not in get_table_columns(csv_file):
        shutil.copyfile(csv_file, destination)
        return destination

    table_grouped_tail(csv_file, destination, ["key"])
    return destination