def test_refactor_dataframes():
    """refactor_dataframes() should extract the 'name' column into a
    People lookup table and rewrite the dataframe to use its ids."""
    records = [
        {'name': 'Terry', 'score': 0.5},
        {'name': 'Terry', 'score': 0.8},
        {'name': 'Owen', 'score': 0.7},
    ]
    df = pd.DataFrame(records)
    conn = sqlite3.connect(':memory:')
    output = utils.refactor_dataframes(
        conn, [df], {'name': ('People', 'first_name')}
    )
    assert len(output) == 1
    dataframe = output[0]
    # There should be a 'People' table in sqlite
    rows = conn.execute('select id, first_name from People').fetchall()
    assert rows == [(1, 'Terry'), (2, 'Owen')]
    expected = (' name score\n'
                '0 1 0.5\n'
                '1 1 0.8\n'
                '2 2 0.7')
    assert str(dataframe) == expected
def test_refactor_dataframes():
    """refactor_dataframes() should extract the 'name' column into a
    People lookup table and replace the values with foreign-key ids."""
    df = pd.DataFrame(
        [
            {"name": "Terry", "score": 0.5},
            {"name": "Terry", "score": 0.8},
            {"name": "Owen", "score": 0.7},
        ]
    )
    conn = sqlite3.connect(":memory:")
    foreign_keys = {"name": ("People", "first_name")}
    output = utils.refactor_dataframes(conn, [df], foreign_keys, False)
    assert len(output) == 1
    dataframe = output[0]
    # There should be a 'People' table in sqlite
    people = conn.execute("select id, first_name from People").fetchall()
    assert people == [(1, "Terry"), (2, "Owen")]
    expected = (
        " name score\n"
        "0 1 0.5\n"
        "1 1 0.8\n"
        "2 2 0.7"
    )
    assert str(dataframe) == expected
def convert_csv_to_sqlite(csv_filename):
    """Convert a CSV file into a SQLite database stored alongside it.

    The database is named after the CSV (``foo.csv`` -> ``foo.sqlite``).
    The CSV is loaded into an ``Items`` table, and columns matching the
    patterns below (including numbered duplicates such as ``Subject_2``)
    are extracted into lookup tables referenced by foreign keys.

    :param csv_filename: path to the CSV file to convert
    :raises AssertionError: if the CSV could not be loaded
    """
    sqlite_filename = "%s.sqlite" % os.path.splitext(csv_filename)[0]
    conn = sqlite3.connect(sqlite_filename)
    # Ensure the connection is always released, even if loading or
    # refactoring raises — the original leaked it on any exception.
    try:
        dataframes = []
        csvs = {os.path.splitext(csv_filename)[0]: csv_filename}
        for name, path in csvs.items():
            try:
                df = load_csv(path, separator=",", skip_errors=False, quoting=0, shape=None)
                df.table_name = "Items"
                dataframes.append(df)
            except LoadCsvError as e:
                # Best-effort: report the failure but keep going.
                print("Could not load {}: {}".format(path, e), file=sys.stderr)
        print("Loaded {} dataframes".format(len(dataframes)))
        assert len(dataframes) == 1
        # Use extract_columns to build a column:(table,label) dictionary.
        # Each regex also matches suffixed duplicates, e.g. "Era_3".
        foreign_keys = {}
        column_map = [
            (re.compile(r"Other Title(?:|_\d+)"), "Title"),
            (re.compile(r"Creator/Publisher(?:|_\d+)"), "Creator"),
            (re.compile(r"Subject(?:|_\d+)"), "Subject"),
            (re.compile(r"Collection(?:|_\d+)"), "Collection"),
            (re.compile(r"Era(?:|_\d+)"), "Era"),
            (re.compile(r"Location(?:|_\d+)"), "Location"),
            (re.compile(r"Geography(?:|_\d+)"), "Geography"),
        ]
        for column_name in dataframes[0].columns:
            for regex, mapped in column_map:
                if regex.search(column_name):
                    foreign_keys[column_name] = (mapped, "value")
        # Now we have loaded the dataframes, we can refactor them
        refactored = refactor_dataframes(conn, dataframes, foreign_keys, False)
        for df in refactored:
            # This is a bit trickier because we need to
            # create the table with extra SQL for foreign keys
            if table_exists(conn, df.table_name):
                drop_table(conn, df.table_name)
            to_sql_with_foreign_keys(conn, df, df.table_name, foreign_keys)
    finally:
        conn.close()
    print("Updated", sqlite_filename)
def test_refactor_dataframes():
    """refactor_dataframes() should return a lookup table for the 'name'
    column plus the rewritten dataframe using the lookup ids."""
    df = pd.DataFrame(
        [
            {'name': 'Terry', 'score': 0.5},
            {'name': 'Terry', 'score': 0.8},
            {'name': 'Owen', 'score': 0.7},
        ]
    )
    output = utils.refactor_dataframes(
        [df], {'name': ('People', 'first_name')}
    )
    assert len(output) == 2
    lookup_table, dataframe = output
    # Each distinct name gets a sequential id.
    assert lookup_table.id_to_value == {1: 'Terry', 2: 'Owen'}
    expected = (' name score\n'
                '0 1 0.5\n'
                '1 1 0.8\n'
                '2 2 0.7')
    assert str(dataframe) == expected