Example #1
def _make_location_key_and_date_table(index_table: Path,
                                      output_path: Path) -> None:
    # Use a temporary directory for intermediate files
    with temporary_directory() as workdir:

        # Make sure that there is an index table present
        assert index_table.exists(), "Index table not found"

        # The index table determines whether to use "key" or "location_key" as the column name
        index_columns = get_table_columns(index_table)
        location_key = "location_key" if "location_key" in index_columns else "key"

        # Create a single-column table with only the keys
        keys_table_path = workdir / "location_keys.csv"
        with open(keys_table_path, "w") as fd:
            fd.write(f"{location_key}\n")
            fd.writelines(
                f"{value}\n"
                for value in table_read_column(index_table, location_key))

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_table_path = workdir / "dates.csv"
        with open(date_table_path, "w") as fd:
            fd.write("date\n")
            fd.writelines(f"{value}\n"
                          for value in date_range("2020-01-01", max_date))

        # Output all combinations of <key x date>
        table_cross_product(keys_table_path, date_table_path, output_path)
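
All four examples in this section rely on a table_cross_product helper whose implementation is not shown. As a rough idea of what it does, here is a minimal streaming sketch, assuming plain CSV inputs with a single header row and no quoting edge cases; the real helper may differ:

import csv
from pathlib import Path


def table_cross_product(left_path: Path, right_path: Path, output_path: Path) -> None:
    """Write the Cartesian product of two CSV tables to the output path."""
    with open(left_path, newline="") as left_fd, \
            open(output_path, "w", newline="") as out_fd:
        left_reader = csv.reader(left_fd)
        writer = csv.writer(out_fd)

        # The output header is the concatenation of both input headers
        left_header = next(left_reader)
        with open(right_path, newline="") as right_fd:
            right_header = next(csv.reader(right_fd))
        writer.writerow(left_header + right_header)

        # Re-read the right table for every left record so that neither
        # table needs to fit in memory
        for left_row in left_reader:
            with open(right_path, newline="") as right_fd:
                right_reader = csv.reader(right_fd)
                next(right_reader)  # skip the header row
                for right_row in right_reader:
                    writer.writerow(left_row + right_row)

Re-reading the right table once per left record trades speed for memory, which matches how the snippets here apply the helper to potentially large <key x date> tables.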
Example #2
    def test_cross_product(self):
        csv1 = """col1,col2
        a,1
        b,2
        c,3
        d,4
        """

        csv2 = """col3,col4
        1,a
        2,b
        3,c
        4,d
        """

        expected = """col1,col2,col3,col4
        a,1,1,a
        a,1,2,b
        a,1,3,c
        a,1,4,d
        b,2,1,a
        b,2,2,b
        b,2,3,c
        b,2,4,d
        c,3,1,a
        c,3,2,b
        c,3,3,c
        c,3,4,d
        d,4,1,a
        d,4,2,b
        d,4,3,c
        d,4,4,d
        """

        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            # Write each fixture to disk, stripping the indentation carried by
            # the triple-quoted strings
            with open(workdir / "1.csv", "w") as fd:
                for line in csv1.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")
            with open(workdir / "2.csv", "w") as fd:
                for line in csv2.split("\n"):
                    if not line.isspace():
                        fd.write(f"{line.strip()}\n")

            output_file = workdir / "out.csv"
            table_cross_product(workdir / "1.csv", workdir / "2.csv",
                                output_file)

            # Compare the generated output against the expected table line by line
            for line1, line2 in zip(expected.split("\n"),
                                    read_lines(output_file)):
                self.assertEqual(line1.strip(), line2.strip())
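
The read_lines helper used above is likewise external to this snippet; presumably it yields the lines of a text file lazily, along the lines of this sketch:

from pathlib import Path
from typing import Iterator


def read_lines(path: Path) -> Iterator[str]:
    # Yield each line of the file without its trailing newline
    with open(path) as fd:
        for line in fd:
            yield line.rstrip("\n")

Note that zip stops at the shorter of its two inputs, so this comparison would not flag extra rows appended to the output; Example #3 below avoids that by comparing the tables as a whole.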
Example #3
    def test_cross_product(self):
        csv1 = _make_test_csv_file(
            """
            col1,col2
            a,1
            b,2
            c,3
            d,4
            """
        )

        csv2 = _make_test_csv_file(
            """
            col3,col4
            1,a
            2,b
            3,c
            4,d
            """
        )

        expected = _make_test_csv_file(
            """
            col1,col2,col3,col4
            a,1,1,a
            a,1,2,b
            a,1,3,c
            a,1,4,d
            b,2,1,a
            b,2,2,b
            b,2,3,c
            b,2,4,d
            c,3,1,a
            c,3,2,b
            c,3,3,c
            c,3,4,d
            d,4,1,a
            d,4,2,b
            d,4,3,c
            d,4,4,d
            """
        )

        with temporary_file() as output_file:
            table_cross_product(csv1, csv2, output_file)
            _compare_tables_equal(self, output_file, expected)
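
The _make_test_csv_file and _compare_tables_equal helpers are not shown either. Minimal sketches that reproduce the fixture handling from Example #2 might look as follows; the temporary-file naming and the comparison details are assumptions:

import os
import textwrap
from pathlib import Path
from tempfile import mkstemp


def _make_test_csv_file(content: str) -> Path:
    # Dedent the triple-quoted fixture and write it to a temporary CSV file
    fd, file_path = mkstemp(suffix=".csv")
    os.close(fd)
    Path(file_path).write_text(textwrap.dedent(content).strip() + "\n")
    return Path(file_path)


def _compare_tables_equal(test_case, table1: Path, table2: Path) -> None:
    # Assert that both files hold identical records, ignoring blank lines and padding
    def read(path: Path):
        with open(path) as fd:
            return [line.strip() for line in fd if line.strip()]

    test_case.assertEqual(read(table1), read(table2))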
Example #4
def make_main_table(tables_folder: Path, output_path: Path) -> None:
    """
    Build a flat view of all tables combined, joined by <key> or <key, date>.

    Arguments:
        tables_folder: Input folder where all CSV files exist.
    Returns:
        DataFrame: Flat table with all data combined.
    """

    # Use a temporary directory for intermediate files
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        # Create a single-column table with only the keys from the index
        keys_table_path = workdir / "keys.csv"
        keys_table = read_file(tables_folder / "index.csv", usecols=["key"])
        export_csv(keys_table, keys_table_path)
        print("Created keys table")

        # Add a date to each region from index to allow iterative left joins
        max_date = (datetime.datetime.now() +
                    datetime.timedelta(days=1)).date().isoformat()
        date_list = [
            date.date().isoformat()
            for date in date_range("2020-01-01", max_date)
        ]
        date_table_path = workdir / "dates.csv"
        export_csv(DataFrame(date_list, columns=["date"]), date_table_path)
        print("Created dates table")

        # Create a temporary working table file which can be used during the steps
        temp_file_path = workdir / "main.tmp.csv"
        table_cross_product(keys_table_path, date_table_path, temp_file_path)
        print("Created cross product table")

        # Add all the index columns to seed the main table
        main_table_path = workdir / "main.csv"
        table_join(temp_file_path, tables_folder / "index.csv", ["key"],
                   main_table_path)
        print("Joined with table index")

        non_dated_columns = set(get_table_columns(main_table_path))
        for table_file_path in pbar([*tables_folder.glob("*.csv")],
                                    desc="Make main table"):
            table_name = table_file_path.stem
            if table_name not in EXCLUDE_FROM_MAIN_TABLE:

                table_columns = get_table_columns(table_file_path)
                if "date" in table_columns:
                    join_on = ["key", "date"]
                else:
                    join_on = ["key"]

                    # Keep track of columns which are not indexed by date
                    non_dated_columns = non_dated_columns | set(table_columns)

                # Iteratively perform left outer joins on all tables
                table_join(main_table_path, table_file_path, join_on,
                           temp_file_path)
                shutil.move(temp_file_path, main_table_path)
                print(f"Joined with table {table_name}")

        # Drop rows with null date or without a single dated record
        # TODO: figure out a memory-efficient way to do this

        # Ensure that the table is appropriately sorted and write it to the output location
        table_sort(main_table_path, output_path)
        print("Sorted main table")