Пример #1
0
def test_read_pandas(with_initial_test_data):
    """Reading at a specific commit returns the table as of that commit."""
    dolt, first_commit = with_initial_test_data
    second_commit = update_test_data(dolt)

    at_first = read_pandas(dolt, TEST_TABLE, first_commit)
    compare_rows_helper(at_first.to_dict('records'), TEST_DATA_INITIAL)

    at_second = read_pandas(dolt, TEST_TABLE, second_commit)
    compare_rows_helper(at_second.to_dict('records'), TEST_DATA_COMBINED)
Пример #2
0
def test_dataframe_table_loader_update(update_test_data):
    """After an update load, the newly added players appear in both tables."""
    repo = update_test_data

    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name'])
    assert 'Rafael' in list(mens['name'])
Пример #3
0
def test_dataframe_table_loader_create(initial_test_data):
    """After the initial load, the first row of each table matches the fixture."""
    repo = initial_test_data

    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert womens.iloc[0]['name'] == 'Serena'
    assert mens.iloc[0]['name'] == 'Roger'
Пример #4
0
def test_get_unique_key_update_writer(init_empty_test_repo):
    """Unique-key writer de-duplicates rows and records a dedup count.

    BUGFIX: the original wrapped the per-player checks in
    ``assert [<list comprehension>]``, which always passes because a
    non-empty list is truthy regardless of the element values. Using
    ``all(...)`` makes the element-wise comparisons actually enforced.
    """
    repo = init_empty_test_repo

    def generate_initial_data():
        # 'Rafael' appears twice, so the writer should record count == 2.
        return pd.DataFrame([{
            'name': 'Roger',
            'id': 1
        }, {
            'name': 'Rafael',
            'id': 2
        }, {
            'name': 'Rafael',
            'id': 2
        }, {
            'name': 'Novak',
            'id': 3
        }])

    test_table = 'test_data'
    get_dolt_loader([
        get_unique_key_table_writer(
            test_table, generate_initial_data, import_mode='create')
    ], True, 'Create test data')(repo)

    # Test that we have what we expect: singletons count 1, duplicate counts 2.
    data = read_pandas(repo, test_table)
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Roger', 'Novak'])
    assert data.loc[data['name'] == 'Rafael',
                    'count'].astype(float).iloc[0] == 2

    def generate_updated_data():
        # Second load replaces the table contents via the default import mode.
        return pd.DataFrame([{
            'name': 'Rafael',
            'id': 2
        }, {
            'name': 'Novak',
            'id': 3
        }, {
            'name': 'Andy',
            'id': 4
        }])

    get_dolt_loader(
        [get_unique_key_table_writer(test_table, generate_updated_data)], True,
        'Updating data')(repo)
    data = read_pandas(repo, test_table)
    # After the update no player is duplicated, so every count must be 1.
    assert all(
        data.loc[data['name'] == player, 'count'].astype(float).iloc[0] == 1
        for player in ['Rafael', 'Novak', 'Andy'])
Пример #5
0
def test_table_transformer_update(update_derived_data):
    """Derived averages incorporate both the initial and the updated rows."""
    repo = update_derived_data
    averages = read_pandas(repo, AVERAGE_MAJOR_COUNT)

    def average_for(gender):
        # Single row per gender; take its 'average' value as a float.
        return averages.loc[averages['gender'] == gender,
                            'average'].astype(float).iloc[0]

    assert average_for('mens') == (20 + 19) / 2
    assert average_for('womens') == (23 + 24) / 2
Пример #6
0
def test_table_transformer_create(initial_derived_data):
    """Derived averages reflect the initial rows only."""
    repo = initial_derived_data
    averages = read_pandas(repo, AVERAGE_MAJOR_COUNT)

    def average_for(gender):
        # Single row per gender; take its 'average' value as a float.
        return averages.loc[averages['gender'] == gender,
                            'average'].astype(float).iloc[0]

    assert average_for('mens') == 20
    assert average_for('womens') == 23
Пример #7
0
def test_sql(create_test_table):
    """Rows inserted via raw SQL are visible through read_pandas."""
    repo, test_table = create_test_table
    insert_stmt = '''
        INSERT INTO {table} (name, id)
        VALUES ('Roger', 3)
    '''.format(table=test_table)
    repo.sql(query=insert_stmt)

    result = read_pandas(repo, test_table)
    assert 'Roger' in result['name'].to_list()
Пример #8
0
def test_branching(initial_test_data):
    """Data written on a new branch is visible there and absent on main."""
    repo = initial_test_data
    test_branch = 'new-branch'

    repo.branch(branch_name=test_branch)
    repo.checkout(test_branch)
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    # The helper leaves us on the branch it wrote to.
    active_branch, _ = repo.branch()
    assert active_branch.name == test_branch

    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name'])
    assert 'Rafael' in list(mens['name'])

    # Back on main the branch-only rows must not be present.
    repo.checkout('main')
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' not in list(womens['name'])
    assert 'Rafael' not in list(mens['name'])
Пример #9
0
def test_multi_branch_load(initial_test_data):
    """Loads to two branches are isolated from main and from each other."""
    repo = initial_test_data
    first_branch, second_branch = 'first-branch', 'second-branch'

    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, first_branch)
    _populate_test_data_helper(repo, SECOND_UPDATE_MENS, SECOND_UPDATE_WOMENS,
                               second_branch)

    def read_names():
        # Names currently visible in each table on the checked-out branch.
        womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
        mens = read_pandas(repo, MENS_MAJOR_COUNT)
        return list(womens['name']), list(mens['name'])

    # main sees neither branch's updates.
    womens_names, mens_names = read_names()
    assert 'Margaret' not in womens_names and 'Rafael' not in mens_names
    assert 'Steffi' not in womens_names and 'Novak' not in mens_names

    repo.checkout(first_branch)
    womens_names, mens_names = read_names()
    assert 'Margaret' in womens_names and 'Rafael' in mens_names

    repo.checkout(second_branch)
    womens_names, mens_names = read_names()
    assert 'Steffi' in womens_names and 'Novak' in mens_names
Пример #10
0
def test_load_to_dolt_new_branch(initial_test_data):
    """Loading to a new branch creates it without changing the active branch."""
    repo = initial_test_data
    test_branch = 'new-branch'

    # check we have only the expected branches in the sample data
    _, branches = repo.branch()
    assert [b.name for b in branches] == ['main']

    # load some data to a new branch
    _populate_test_data_helper(repo, UPDATE_MENS, UPDATE_WOMENS, test_branch)

    # check that we are still on the branch we started on
    active_branch, all_branches = repo.branch()
    assert active_branch.name == 'main'
    assert [b.name for b in all_branches] == ['main', test_branch]

    # check out our new branch and confirm our data is present
    repo.checkout(test_branch)
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT)
    mens = read_pandas(repo, MENS_MAJOR_COUNT)
    assert 'Margaret' in list(womens['name'])
    assert 'Rafael' in list(mens['name'])
Пример #11
0
def test_insert_unique_key(init_empty_test_repo):
    """insert_unique_key adds a hash_id column and counts duplicate rows."""
    repo = init_empty_test_repo

    def generate_data():
        return pd.DataFrame({'id': [1, 1, 2], 'value': ['foo', 'foo', 'baz']})

    test_table = 'test_data'
    writer = get_df_table_writer(test_table,
                                 generate_data, ['hash_id'],
                                 transformers=[insert_unique_key])
    get_dolt_loader([writer], True, 'Updating test data')(repo)

    result = read_pandas(repo, test_table)
    # id == 1 appeared twice in the input, so its dedup count must be 2.
    assert result.loc[result['id'] == 1, 'count'].iloc[0] == 2
    assert 'hash_id' in result.columns
Пример #12
0
def test_get_bulk_table_loader(init_empty_test_repo):
    """Bulk loader applies the cleaner transformer before importing the CSV."""
    repo = init_empty_test_repo
    table = 'test_table'

    def get_data():
        return io.StringIO(CORRUPT_CSV)

    def cleaner(data: io.StringIO) -> io.StringIO:
        # Drop every row whose field count disagrees with the header.
        cleaned = io.StringIO()
        header_line = data.readline()
        expected_fields = len(header_line.split(','))
        cleaned.write(header_line)
        for row in data.readlines():
            if len(row.split(',')) != expected_fields:
                print('Corrupt line, discarding:\n{}'.format(row))
            else:
                cleaned.write(row)

        cleaned.seek(0)
        return cleaned

    get_bulk_table_writer(table,
                          get_data, ['player_name'],
                          import_mode=CREATE,
                          transformers=[cleaner])(repo)

    actual = read_pandas(repo, table)
    expected = io.StringIO(CLEANED_CSV)
    headers = [col.rstrip() for col in expected.readline().split(',')]
    assert all(headers == actual.columns)

    # Every surviving row of the cleaned CSV must match the loaded table.
    week_counts = actual.set_index(
        'player_name')['weeks_at_number_1'].to_dict()
    for row in expected.readlines():
        player_name, weeks_at_number_1 = row.split(',')
        assert player_name in week_counts
        assert week_counts[player_name] == int(weeks_at_number_1.rstrip())
Пример #13
0
        "short_description like '%%transfusion%%')         or (long_description like '%%blood%%' and             " \
        "long_description like '%%transfusion%%'); "


def chunker(seq, size):
    """Lazily yield consecutive slices of *seq* of length *size*.

    The final slice may be shorter when len(seq) is not a multiple of size.
    """
    return (seq[start:start + size] for start in range(0, len(seq), size))


def insert_with_progress(df, table_name):
    """Append *df* to SQL table *table_name* in chunks, showing a progress bar.

    BUGFIX vs the original fixed-10-iterations loop:
    - any remainder beyond 10 * (len(df) // 10) rows was silently dropped;
    - a DataFrame shorter than 10 rows produced chunksize 0 and inserted
      nothing at all.
    Iterating by position in steps of chunksize covers every row exactly once.
    """
    total = len(df)
    # Aim for ~10 chunks as before, but never a zero-length chunk.
    chunksize = max(total // 10, 1)
    with tqdm(total=total) as pbar:
        for pos in range(0, total, chunksize):
            cdf = df.iloc[pos:pos + chunksize, :]
            cdf.to_sql(name=table_name,
                       con=engine,
                       if_exists="append",
                       index=False)
            # Update by the actual chunk length so the bar ends at `total`.
            pbar.update(len(cdf))
            # HACK: carried over from the original — clears stale tqdm bars
            # so repeated calls don't leave broken instances behind.
            tqdm._instances.clear()


# Copy each Dolt table into the SQL database via the chunked inserter.
# NOTE(review): `repo` and `engine` are assumed to be defined earlier in the
# file (outside this view) — confirm before running this standalone.
cpt_hcpcs_df = read_pandas(repo, "cpt_hcpcs")
insert_with_progress(cpt_hcpcs_df, "cpt_hcpcs")

hospitals_df = read_pandas(repo, "hospitals")
insert_with_progress(hospitals_df, "hospitals")

prices_df = read_pandas(repo, "prices")
insert_with_progress(prices_df, "prices")
Пример #14
0
        "long_description like '%%transfusion%%'); "


def chunker(seq, size):
    """Yield *size*-length slices of *seq*; the last slice may be shorter."""
    for offset in range(0, len(seq), size):
        yield seq[offset:offset + size]


def insert_with_progress(df, table_name):
    """Append *df* to SQL table *table_name* in chunks, showing a progress bar.

    BUGFIX vs the original fixed-10-iterations loop:
    - any remainder beyond 10 * (len(df) // 10) rows was silently dropped;
    - a DataFrame shorter than 10 rows produced chunksize 0 and inserted
      nothing at all.
    Stepping by chunk position covers every row exactly once.
    """
    total = len(df)
    # Aim for ~10 chunks as before, but never a zero-length chunk.
    chunksize = max(total // 10, 1)
    with tqdm(total=total) as pbar:
        for pos in range(0, total, chunksize):
            cdf = df.iloc[pos:pos + chunksize, :]
            cdf.to_sql(name=table_name,
                       con=engine,
                       if_exists="append",
                       index=False)
            # Update by the actual chunk length so the bar ends at `total`.
            pbar.update(len(cdf))
            # HACK: carried over from the original — clears stale tqdm bars
            # so repeated calls don't leave broken instances behind.
            tqdm._instances.clear()


# Earlier table loads are left commented out; only "prices" is loaded here.
# cpt_hcpcs_df = read_pandas(repo, "cpt_hcpcs")
# insert_with_progress(cpt_hcpcs_df, "cpt_hcpcs")

# hospitals_df = read_pandas(repo, "hospitals")
# insert_with_progress(hospitals_df, "hospitals")

# Read the full "prices" table from the Dolt repo and bulk-insert it.
# NOTE(review): `repo` is assumed to be defined earlier in the file — confirm.
prices_df = read_pandas(repo, "prices")
insert_with_progress(prices_df, "prices")
Пример #15
0
def get_raw_data(repo: Dolt):
    """Stack men's and women's major-count tables, tagged with a gender column."""
    mens = read_pandas(repo, MENS_MAJOR_COUNT).assign(gender='mens')
    womens = read_pandas(repo, WOMENS_MAJOR_COUNT).assign(gender='womens')
    return pd.concat([mens, womens])