import sys
import traceback
from functools import partial
from pathlib import Path
from typing import Dict, Iterable, Optional

# thread_map is tqdm's thread-pool map with a built-in progress bar
from tqdm.contrib.concurrent import thread_map

# convert_csv_to_json_records and get_schema are project-internal helpers assumed in scope


def convert_tables_to_json(csv_folder: Path, output_folder: Path) -> Iterable[Path]:
    def try_json_covert(schema: Dict[str, str], csv_file: Path) -> Optional[Path]:
        # JSON output defaults to the same path as the CSV file but with the extension swapped
        json_output = output_folder / str(csv_file.relative_to(csv_folder)).replace(
            ".csv", ".json"
        )
        json_output.parent.mkdir(parents=True, exist_ok=True)

        # Converting to JSON is not critical and it may fail in some corner cases.
        # As long as the "important" JSON files are created, this should be OK.
        try:
            print(f"Converting {csv_file} to JSON")
            convert_csv_to_json_records(schema, csv_file, json_output)
            return json_output
        except Exception as exc:
            print(f"Unable to convert CSV file {csv_file} to JSON: {exc}", file=sys.stderr)
            traceback.print_exc()
            return None

    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_func = partial(try_json_covert, get_schema())
    for json_output in thread_map(map_func, map_iter, max_workers=2, desc="JSON conversion"):
        if json_output is not None:
            yield json_output
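
# A minimal usage sketch with hypothetical paths (not from the original source).
# convert_tables_to_json is a generator, so it must be consumed for any
# conversion work to actually happen.
def _demo_convert_tables() -> None:
    csv_root = Path("output/tables")  # hypothetical folder containing the CSV files
    json_root = Path("output/json")  # hypothetical destination for the JSON output
    converted = list(convert_tables_to_json(csv_root, json_root))
    print(f"Converted {len(converted)} CSV files to JSON")
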
def test_table_records_reimport(self):
    with TemporaryDirectory() as workdir:
        workdir = Path(workdir)

        schema = {_safe_column_name(col): dtype for col, dtype in get_schema().items()}

        sqlite_file = workdir / "tmp.sqlite"
        tables_folder = SRC / "test" / "data"
        with create_sqlite_database(db_file=sqlite_file) as conn:
            for table_path in tables_folder.glob("*.csv"):
                table_name = _safe_table_name(table_path.stem)
                table_import_from_file(conn, table_path, schema=schema)

                # Export the records to a list
                records_output_1 = list(table_select_all(conn, table_name))

                # Import the list of records
                table_name_2 = table_name + "_new"
                table_import_from_records(conn, table_name_2, records_output_1, schema=schema)

                # Re-export the records as a list
                records_output_2 = list(table_select_all(conn, table_name_2))

                for record1, record2 in zip(records_output_1, records_output_2):
                    self.assertDictEqual(record1, record2)
def convert_tables_to_json(csv_folder: Path, output_folder: Path, **tqdm_kwargs) -> Iterable[Path]:
    # Convert all CSV files to JSON using values format
    map_iter = list(csv_folder.glob("**/*.csv"))
    map_opts = dict(total=len(map_iter), desc="Converting to JSON", **tqdm_kwargs)
    map_func = partial(_try_json_covert, get_schema(), csv_folder, output_folder)
    return list(pbar(map(map_func, map_iter), **map_opts))
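
# The "values format" referenced above is assumed here to mean a compact JSON
# layout with the column names listed once plus row-major value arrays; this is
# an illustrative sketch, not the project's convert_csv_to_json_records.
import csv
import json
from pathlib import Path


def csv_to_values_json(csv_file: Path, json_file: Path) -> None:
    with open(csv_file, newline="") as fh:
        reader = csv.reader(fh)
        columns = next(reader)  # header row becomes the "columns" list
        data = list(reader)  # remaining rows become the "data" arrays
    json_file.write_text(json.dumps({"columns": columns, "data": data}))
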
def import_tables_into_sqlite(table_paths: List[Path], output_path: Path) -> None:
    """
    Import a list of CSV tables into a single SQLite database, where each table
    is indexed by <key> or by <key, date>.

    Arguments:
        table_paths: List of CSV files to import as tables.
        output_path: Output path for the resulting SQLite file.
    """
    # Import all tables into a database on disk at the provided path
    with create_sqlite_database(output_path) as conn:
        # Look up the expected dtype of each column from the pipeline schema
        schema = get_schema()
        for table_file_path in table_paths:
            table_name = table_file_path.stem
            _logger.log_info(f"Importing {table_name} into SQLite")
            table_columns = get_table_columns(table_file_path)
            # Default to str for any column not present in the schema
            table_schema = {col: schema.get(col, str) for col in table_columns}
            table_import_from_file(
                conn, table_file_path, table_name=table_name, schema=table_schema
            )
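
# A hedged usage sketch with hypothetical paths: import every CSV under a
# folder into a single SQLite file, then list the resulting tables using the
# standard-library sqlite3 module.
import sqlite3
from pathlib import Path
from typing import List


def _demo_sqlite_import(csv_folder: Path, db_path: Path) -> None:
    import_tables_into_sqlite(sorted(csv_folder.glob("*.csv")), db_path)
    with sqlite3.connect(str(db_path)) as conn:
        rows = conn.execute("SELECT name FROM sqlite_master WHERE type = 'table'")
        print([name for (name,) in rows])
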
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import Callable, Dict, List

from pandas import DataFrame

from lib.constants import SRC
from lib.io import read_lines, read_table, export_csv
from lib.memory_efficient import (
    table_cross_product,
    table_join,
    table_group_tail,
    _convert_csv_to_json_records_fast,
    _convert_csv_to_json_records_slow,
)
from lib.pipeline_tools import get_schema
from lib.utils import agg_last_not_null, pbar

from .profiled_test_case import ProfiledTestCase

# Read the expected dtypes to ensure casting does not throw off test results
SCHEMA = get_schema()


class TestTableJoins(ProfiledTestCase):
    def _test_join_pair(
        self,
        read_table_: Callable,
        schema: Dict[str, str],
        left: Path,
        right: Path,
        on: List[str],
        how: str,
    ):
        with TemporaryDirectory() as workdir:
            workdir = Path(workdir)
            tmpfile = workdir / "tmpfile.csv"
def main():
    schema = get_schema()
    for table_name in tqdm(list(get_table_names())):
        table = fetch_table(table_name)
        table = table.sort_values([col for col in ("key", "date") if col in table.columns])
        export_csv(table, path=SRC / "test" / "data" / f"{table_name}.csv", schema=schema)
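
# Assuming this function is the entry point of a standalone script (as the name
# main suggests), the conventional guard would follow it:
if __name__ == "__main__":
    main()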