def test_read_pandas(with_initial_test_data):
    """Reading at two different commits returns the data as of each commit."""
    dolt, first_commit = with_initial_test_data
    second_commit = update_test_data(dolt)

    # Snapshot at the first commit matches the initial fixture rows.
    at_first = read_pandas(dolt, TEST_TABLE, first_commit)
    compare_rows_helper(at_first.to_dict('records'), TEST_DATA_INITIAL)

    # Snapshot at the second commit includes the update as well.
    at_second = read_pandas(dolt, TEST_TABLE, second_commit)
    compare_rows_helper(at_second.to_dict('records'), TEST_DATA_COMBINED)
def test_dataframe_table_loader_update(update_test_data):
    """After an update load, the newly added players appear in both tables."""
    repo = update_test_data
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name'])
    assert 'Rafael' in list(mens['name'])
def test_dataframe_table_loader_create(initial_test_data):
    """The initial load seeds each table with the expected first row."""
    repo = initial_test_data
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert womens.iloc[0]['name'] == 'Serena'
    assert mens.iloc[0]['name'] == 'Roger'
def test_get_unique_key_update_writer(init_empty_test_repo):
    """The unique-key writer de-duplicates rows and tracks a 'count' column.

    Creates a table with a duplicated row, checks the counts, then replaces
    the data and checks the counts again after the update.
    """
    repo = init_empty_test_repo

    def generate_initial_data():
        # 'Rafael' appears twice on purpose: the writer should collapse the
        # duplicates into a single row with count == 2.
        return pd.DataFrame([
            {'name': 'Roger', 'id': 1},
            {'name': 'Rafael', 'id': 2},
            {'name': 'Rafael', 'id': 2},
            {'name': 'Novak', 'id': 3},
        ])

    test_table = 'test_data'
    get_dolt_loader([
        get_unique_key_table_writer(
            test_table, generate_initial_data, import_mode='create')
    ], True, 'Create test data')(repo)

    # Test that we have what we expect
    data = read_pandas(repo, test_table)
    # BUG FIX: the original wrote `assert [<genexp results>]`, which asserts
    # a non-empty list and is therefore always true; `all(...)` actually
    # enforces each per-player check.
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Roger', 'Novak']
    )
    assert data.loc[data['name'] == 'Rafael', 'count'].astype(float).iloc[0] == 2

    def generate_updated_data():
        return pd.DataFrame([
            {'name': 'Rafael', 'id': 2},
            {'name': 'Novak', 'id': 3},
            {'name': 'Andy', 'id': 4},
        ])

    get_dolt_loader(
        [get_unique_key_table_writer(test_table, generate_updated_data)],
        True, 'Updating data')(repo)
    data = read_pandas(repo, test_table)
    # Same fix as above: every remaining player must have count == 1.
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Rafael', 'Novak', 'Andy']
    )
def test_table_transformer_update(update_derived_data):
    """Derived averages reflect both the initial and the updated rows."""
    repo = update_derived_data
    avg_df = read_pandas(repo, AVERAGE_MAJOR_COUNT)

    def average_for(gender):
        # Scalar 'average' value for the single row matching this gender.
        return avg_df.loc[avg_df['gender'] == gender, 'average'].astype(float).iloc[0]

    assert average_for('mens') == (20 + 19) / 2
    assert average_for('womens') == (23 + 24) / 2
def test_table_transformer_create(initial_derived_data):
    """Derived averages after the initial load match the seed data."""
    repo = initial_derived_data
    avg_df = read_pandas(repo, AVERAGE_MAJOR_COUNT)
    for gender, expected in (('mens', 20), ('womens', 23)):
        actual = avg_df.loc[avg_df['gender'] == gender, 'average'].astype(float).iloc[0]
        assert actual == expected
def test_sql(create_test_table):
    """repo.sql executes a raw INSERT and the row shows up on read."""
    repo, test_table = create_test_table
    # NOTE(review): string-formatted SQL is acceptable here only because
    # test_table comes from a fixture, not from untrusted input.
    insert_stmt = '''
    INSERT INTO {table} (name, id)
    VALUES ('Roger', 3)
    '''.format(table=test_table)
    repo.sql(query=insert_stmt)

    inserted = read_pandas(repo, test_table)
    assert 'Roger' in inserted['name'].to_list()
def test_branching(initial_test_data):
    """Data loaded on a new branch is visible there but not on main."""
    repo = initial_test_data
    test_branch = 'new-branch'
    repo.branch(branch_name=test_branch)
    repo.checkout(test_branch)
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    active_branch, _ = repo.branch()
    assert active_branch.name == test_branch

    # On the new branch the update rows are present...
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name'])
    assert 'Rafael' in list(mens['name'])

    # ...but main is untouched.
    repo.checkout('main')
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' not in list(womens['name'])
    assert 'Rafael' not in list(mens['name'])
def test_multi_branch_load(initial_test_data):
    """Loads to two branches stay isolated from main and from each other."""
    repo = initial_test_data
    first_branch, second_branch = 'first-branch', 'second-branch'
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, first_branch)
    _populate_test_data_helper(repo, SECOND_UPDATE_MENS, SECOND_UPDATE_WOMENS,
                               second_branch)

    # main sees neither branch's rows.
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' not in list(womens['name'])
    assert 'Rafael' not in list(mens['name'])
    assert 'Steffi' not in list(womens['name'])
    assert 'Novak' not in list(mens['name'])

    # The first branch sees only the first update.
    repo.checkout(first_branch)
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name'])
    assert 'Rafael' in list(mens['name'])

    # The second branch sees only the second update.
    repo.checkout(second_branch)
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Steffi' in list(womens['name'])
    assert 'Novak' in list(mens['name'])
def test_load_to_dolt_new_branch(initial_test_data):
    """Loading to a new branch creates it without switching the active branch."""
    repo = initial_test_data
    test_branch = 'new-branch'

    # check we have only the expected branches in the sample data
    _, branches = repo.branch()
    assert [b.name for b in branches] == ['main']

    # load some data to a new branch
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    # check that we are still on the branch we started on
    active, all_branches = repo.branch()
    assert active.name == 'main' and [b.name for b in all_branches] == ['main', test_branch]

    # check out our new branch and confirm our data is present
    repo.checkout(test_branch)
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name']) and 'Rafael' in list(mens['name'])
def test_insert_unique_key(init_empty_test_repo):
    """insert_unique_key collapses duplicate rows and adds a 'hash_id' column."""
    repo = init_empty_test_repo

    def generate_data():
        # id=1 appears twice, so its 'count' should come back as 2.
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    writer = get_df_table_writer(test_table, generate_data, ['hash_id'],
                                 transformers=[insert_unique_key])
    get_dolt_loader([writer], True, 'Updating test data')(repo)

    result = read_pandas(repo, test_table)
    assert result.loc[result['id'] == 1, 'count'].iloc[0] == 2 and 'hash_id' in result.columns
def test_get_bulk_table_loader(init_empty_test_repo):
    """The bulk writer applies the cleaner, dropping malformed CSV rows."""
    repo = init_empty_test_repo
    table = 'test_table'

    def get_data():
        return io.StringIO(CORRUPT_CSV)

    def cleaner(data: io.StringIO) -> io.StringIO:
        # Keep only rows whose field count matches the header's.
        cleaned = io.StringIO()
        header_line = data.readline()
        columns = header_line.split(',')
        cleaned.write(header_line)
        for line in data.readlines():
            if len(line.split(',')) != len(columns):
                print('Corrupt line, discarding:\n{}'.format(line))
            else:
                cleaned.write(line)
        cleaned.seek(0)
        return cleaned

    get_bulk_table_writer(table, get_data, ['player_name'],
                          import_mode=CREATE, transformers=[cleaner])(repo)
    actual = read_pandas(repo, table)

    expected = io.StringIO(CLEANED_CSV)
    headers = [col.rstrip() for col in expected.readline().split(',')]
    assert all(headers == actual.columns)

    # Every cleaned row must survive the round trip through the table.
    players_to_week_counts = actual.set_index('player_name')['weeks_at_number_1'].to_dict()
    for line in expected.readlines():
        player_name, weeks_at_number_1 = line.split(',')
        assert (player_name in players_to_week_counts
                and players_to_week_counts[player_name] == int(weeks_at_number_1.rstrip()))
"short_description like '%%transfusion%%') or (long_description like '%%blood%%' and " \
    "long_description like '%%transfusion%%'); "


def chunker(seq, size):
    """Yield consecutive slices of `seq` of length `size` (last may be shorter)."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


def insert_with_progress(df, table_name):
    """Append `df` to SQL table `table_name` via `engine`, with a tqdm bar.

    Writes in roughly ten chunks via `chunker`.
    """
    total = len(df)
    # BUG FIX: the original wrote exactly 10 chunks of size len(df)//10,
    # silently dropping the remainder rows (and inserting nothing at all
    # when len(df) < 10). Using chunker covers every row; max(..., 1)
    # guards against a zero chunk size on small frames.
    chunksize = max(int(total / 10), 1)
    with tqdm(total=total) as pbar:
        for chunk in chunker(df, chunksize):
            chunk.to_sql(name=table_name, con=engine, if_exists="append", index=False)
            pbar.update(len(chunk))
    tqdm._instances.clear()


cpt_hcpcs_df = read_pandas(repo, "cpt_hcpcs")
insert_with_progress(cpt_hcpcs_df, "cpt_hcpcs")
hospitals_df = read_pandas(repo, "hospitals")
insert_with_progress(hospitals_df, "hospitals")
prices_df = read_pandas(repo, "prices")
insert_with_progress(prices_df, "prices")
"long_description like '%%transfusion%%'); "


def chunker(seq, size):
    """Yield consecutive slices of `seq` of length `size` (last may be shorter)."""
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))


def insert_with_progress(df, table_name):
    """Append `df` to SQL table `table_name` via `engine`, with a tqdm bar.

    Writes in roughly ten chunks via `chunker`.
    """
    total = len(df)
    # BUG FIX: the original wrote exactly 10 chunks of size len(df)//10,
    # silently dropping the remainder rows (and inserting nothing at all
    # when len(df) < 10). Using chunker covers every row; max(..., 1)
    # guards against a zero chunk size on small frames.
    chunksize = max(int(total / 10), 1)
    with tqdm(total=total) as pbar:
        for chunk in chunker(df, chunksize):
            chunk.to_sql(name=table_name, con=engine, if_exists="append", index=False)
            pbar.update(len(chunk))
    tqdm._instances.clear()


# Earlier loads left disabled on purpose (presumably already imported);
# only the prices table is loaded on this run.
# cpt_hcpcs_df = read_pandas(repo, "cpt_hcpcs")
# insert_with_progress(cpt_hcpcs_df, "cpt_hcpcs")
# hospitals_df = read_pandas(repo, "hospitals")
# insert_with_progress(hospitals_df, "hospitals")
prices_df = read_pandas(repo, "prices")
insert_with_progress(prices_df, "prices")
def get_raw_data(repo: Dolt):
    """Return the men's and women's major counts as one frame tagged by gender."""
    frames = [
        read_pandas(repo, MENS_MAJOR_COUNT).assign(gender='mens'),
        read_pandas(repo, WOMENS_MAJOR_COUNT).assign(gender='womens'),
    ]
    return pd.concat(frames)