Пример #1
0
def test_dag_elim():
    """Repeated identical sub-pipelines should lead to a bounded number of table references in SQL."""
    pd = data_algebra.default_data_model.pd
    df = pd.DataFrame({'x': [1, 2, 3]})
    # build the same derived view twice, as two separate objects on purpose
    left_branch = descr(d=df).extend({'y': 'x + 1'})
    right_branch = descr(d=df).extend({'y': 'x + 1'})
    ops = left_branch.natural_join(
        b=right_branch,
        by=['x'],
        jointype='left',
    )
    db_model = data_algebra.SQLite.SQLiteModel()
    sql = db_model.to_sql(
        ops,
        sql_format_options=data_algebra.db_model.SQLFormatOptions(
            use_with=True, annotate=False))
    table_refs = sql.count('"d"')
    assert 0 < table_refs <= 2
    # assert table_refs == 1  # show table is referenced exactly once
    expect = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [2, 3, 4],
    })
    data_algebra.test_util.check_transform(ops=ops, data=df, expect=expect)
Пример #2
0
def test_dag_elim_btt():
    """Joining a table description directly with an identical description must raise ValueError."""
    pd = data_algebra.default_data_model.pd
    df = pd.DataFrame({'x': [1, 2, 3]})
    ops = descr(d=df).natural_join(
        b=descr(d=df),
        by=['x'],
        jointype='left',
    )
    sqlite_model = data_algebra.SQLite.SQLiteModel()
    # SQL generation is expected to reject this degenerate self-join
    with pytest.raises(ValueError):
        sqlite_model.to_sql(ops)
Пример #3
0
def test_dag_elim_ute():
    """concat_rows of a table with a derived copy of itself transforms as expected."""
    pd = data_algebra.default_data_model.pd
    df = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [1, 2, 3],
    })
    derived = descr(d=df).extend({'y': 'x'})
    ops = descr(d=df).concat_rows(b=derived)
    # concat adds a source_name marker column distinguishing the two inputs
    expect = pd.DataFrame({
        'x': [1, 2, 3, 1, 2, 3],
        'y': [1, 2, 3, 1, 2, 3],
        'source_name': ['a', 'a', 'a', 'b', 'b', 'b'],
    })
    data_algebra.test_util.check_transform(ops=ops, data=df, expect=expect)
Пример #4
0
def test_dag_elim_bte():
    """Left join of a table against a derived copy of itself transforms as expected."""
    pd = data_algebra.default_data_model.pd
    df = pd.DataFrame({'x': [1, 2, 3]})
    derived = descr(d=df).extend({'y': 'x'})
    ops = descr(d=df).natural_join(
        b=derived,
        by=['x'],
        jointype='left',
    )
    expect = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [1, 2, 3],
    })
    data_algebra.test_util.check_transform(ops=ops, data=df, expect=expect)
Пример #5
0
def test_dag_elim_bttf():
    # test work around or fix
    pd = data_algebra.default_data_model.pd
    df = pd.DataFrame({'x': [1, 2, 3]})
    # the extra column makes b differ from the left-hand description;
    # can't drop it or the optimizer gets rid of the waste extend
    padded = descr(d=df).extend({'waste_column': 1})
    ops = descr(d=df).natural_join(
        b=padded,
        by=['x'],
        jointype='left',
    )
    expect = pd.DataFrame({'x': [1, 2, 3], 'waste_column': 1})
    data_algebra.test_util.check_transform(ops=ops, data=df, expect=expect)
Пример #6
0
def test_shift():
    """v.shift(k) lags (k>0) or leads (k<0) a column under an explicit row order; shift() defaults to lag 1."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({
        'x': [1, 2, 3, 4, 5],
        'v': ['a', 'b', 'c', 'd', 'e'],
    })
    shift_exprs = {
        'v_m2': 'v.shift(-2)',
        'v_m1': 'v.shift(-1)',
        'v_s': 'v.shift()',
        'v_p1': 'v.shift(1)',
        'v_p2': 'v.shift(2)',
    }
    ops = descr(d=frame).extend(shift_exprs, order_by=['x'])
    res = ops.transform(frame)
    expect = pd.DataFrame({
        'x': [1, 2, 3, 4, 5],
        'v': ['a', 'b', 'c', 'd', 'e'],
        'v_m2': ['c', 'd', 'e', None, None],
        'v_m1': ['b', 'c', 'd', 'e', None],
        'v_s': [None, 'a', 'b', 'c', 'd'],
        'v_p1': [None, 'a', 'b', 'c', 'd'],
        'v_p2': [None, None, 'a', 'b', 'c'],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(ops, data={"d": frame}, expect=expect)
Пример #7
0
def test_first_last_extend():
    """first()/last() as window functions give per-group boundary values under partition/order."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': ['a', 'b', 'c', 'd'],
        'g': ['o', 'o', 't', 't']
    })
    ops = descr(d=frame).extend(
        {
            'v_first': 'v.first()',
            'v_last': 'v.last()',
        },
        partition_by=['g'],
        order_by=['x'])
    res = ops.transform(frame)
    expect = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': ['a', 'b', 'c', 'd'],
        'g': ['o', 'o', 't', 't'],
        'v_first': ['a', 'a', 'c', 'c'],
        'v_last': ['b', 'b', 'd', 'd'],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    # databases below implement these windows differently from the Pandas semantics, so skip them
    data_algebra.test_util.check_transform(
        ops=ops,
        data=frame,
        expect=expect,
        models_to_skip={
            data_algebra.SQLite.SQLiteModel(),
            data_algebra.PostgreSQL.PostgreSQLModel(),
            data_algebra.BigQuery.BigQueryModel(),
            data_algebra.MySQL.MySQLModel(),
            data_algebra.SparkSQL.SparkSQLModel(),  # cumulative implementation (nice but not same as python)
        })
Пример #8
0
def test_float_divide_works():
    """%/% float-division result, with bad values (inf/nan) mapped to None, agrees across engines."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'a': [1, 1, 1, 0],
        'b': [1, 2, 0, 0],
    })
    ops = (
        descr(d=d).extend({
            'r': 'a %/% b'
        }).extend({  # sqlite returns nan for 1/0, not +inf
            'r': 'r.is_bad().where(None, r)'  # fix: was an f-string with no placeholders
        }))
    res_pandas = ops.transform(d)
    expect = pd.DataFrame({
        'a': [1, 1, 1, 0],
        'b': [1, 2, 0, 0],
        'r': [1.0, 0.5, None, None],
    })
    assert data_algebra.test_util.equivalent_frames(expect, res_pandas)
    # confirm the same answer coming back from SQLite
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    assert data_algebra.test_util.equivalent_frames(expect, res_sqlite)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=d,
        expect=expect,
        models_to_skip={
            data_algebra.MySQL.MySQLModel(),  # sqlalchemy won't insert inf
            data_algebra.SparkSQL.SparkSQLModel(
            ),  # probably not inserting values
        })
Пример #9
0
def test_is_inf():
    """is_inf/is_nan/is_bad/is_null classify float values the same way in Pandas and SQLite."""
    pd = data_algebra.default_data_model.pd
    values = [1.0, numpy.inf, numpy.nan, None, 0.0, -1.0, -numpy.inf]
    d = pd.DataFrame({'a': values})
    ops = descr(d=d).extend({
        'is_inf': 'a.is_inf().if_else(1, 0)',
        'is_nan': 'a.is_nan().if_else(1, 0)',
        'is_bad': 'a.is_bad().if_else(1, 0)',
        'is_null': 'a.is_null().if_else(1, 0)',
    })
    expect = pd.DataFrame({
        'a': values,
        'is_inf': [0, 1, 0, 0, 0, 0, 1],
        'is_nan': [0, 0, 1, 1, 0, 0, 0],
        'is_bad': [0, 1, 1, 1, 0, 0, 1],
        'is_null': [0, 0, 1, 1, 0, 0, 0],
    })
    assert data_algebra.test_util.equivalent_frames(expect, ops.transform(d))
    # round-trip through an in-memory SQLite database
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    assert data_algebra.test_util.equivalent_frames(expect, res_sqlite)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=d,
        expect=expect,
        models_to_skip={
            data_algebra.MySQL.MySQLModel(),  # sqlalchemy won't insert inf
            data_algebra.SparkSQL.SparkSQLModel(),  # probably not inserting values
        })
Пример #10
0
def test_if_else_return_type():
    """where()/if_else() on a boolean column with float branches must produce float64 columns."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [True, False, None]})
    ops = descr(d=d).extend({
        'w': 'x.where(1.0, 2.0)',
        'i': 'x.if_else(1.0, 2.0)',
    })
    # where() treats the None control as False; if_else() propagates it as NaN
    expect = pd.DataFrame({
        'x': [True, False, None],
        'w': [1.0, 2.0, 2.0],
        'i': [1.0, 2.0, numpy.nan],
    })

    def check_result(frame):
        assert data_algebra.test_util.equivalent_frames(frame, expect)
        assert str(frame['w'].dtype) == 'float64'
        assert str(frame['i'].dtype) == 'float64'
        # when column types are wrong this threw in pyvtreat's test_KDD2009.py
        numpy.isnan(frame.loc[:, ['w', 'i']])

    check_result(ops.transform(d))
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    check_result(res_sqlite)
Пример #11
0
def test_dag_elim_utt():
    """Concatenating a table with itself should generate SQL and transform correctly."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [1, 2, 3],
    })
    ops = (descr(d=d).concat_rows(b=descr(d=d), ))
    db_model = data_algebra.SQLite.SQLiteModel()
    sql = db_model.to_sql(
        ops,
        sql_format_options=data_algebra.db_model.SQLFormatOptions(
            use_with=True, annotate=False))
    # fix: sql was generated but never checked; confirm we got non-empty query text
    assert isinstance(sql, str)
    assert len(sql) > 0
    expect = pd.DataFrame({
        'x': [1, 2, 3, 1, 2, 3],
        'y': [1, 2, 3, 1, 2, 3],
        'source_name': ['a', 'a', 'a', 'b', 'b', 'b'],
    })
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
Пример #12
0
def xicor_score_variables_plan(
        d: ViewRepresentation,
        *,
        x_vars: Iterable[str],
        y_name: str,
        n_rep: int = 25,
):
    """
    Set up a query to batch compute xicor.

    :param d: description of incoming data frame
    :param x_vars: list of explanatory variable names
    :param y_name: name of dependent variable
    :param n_rep: number of times to repeat calculation
    :return: group_calc_ops, rep_frame_name, rep_frame
    """
    # validate inputs: x_vars must be a non-empty list of unique strings,
    # all present in d's columns and distinct from y_name
    assert not isinstance(x_vars, str)
    x_vars = list(x_vars)
    assert len(x_vars) > 0
    assert numpy.all([isinstance(c, str) for c in x_vars])
    assert len(x_vars) == len(set(x_vars))
    assert isinstance(y_name, str)
    assert y_name not in x_vars
    d_col_set = set(d.column_names)
    assert y_name in d_col_set
    assert numpy.all([c in d_col_set for c in x_vars])
    assert isinstance(n_rep, int)
    # record map: reshape rows of d into blocks keyed by variable_name,
    # one (x, y) pair per explanatory variable per original row
    record_map = RecordMap(
        blocks_out=RecordSpecification(
            control_table=data_algebra.pandas_model.pd.DataFrame({
                'variable_name': x_vars,
                'x': x_vars,
                'y': y_name,

            }),
            record_keys=[],
            control_table_keys=['variable_name'])
    )
    # one row per repetition index; cross-joined below so every
    # (variable, row) combination is evaluated n_rep times
    rep_frame = data_algebra.default_data_model.pd.DataFrame({'rep': range(n_rep)})
    grouped_calc = (
        xicor_query(
            d
                .convert_records(record_map)
                .natural_join(  # cross join rows to get experiment repetitions
                        b=descr(rep_frame=rep_frame),
                        by=[],
                        jointype='cross',
                    ),
                var_keys=['variable_name', 'rep'])
            # summarize the per-repetition xicor scores per variable
            .project({
                    'xicor_mean': 'xicor.mean()',
                    'xicor_std': 'xicor.std()',
                },
                group_by=['variable_name'])
            .order_rows(['variable_name'])
    )
    # caller must supply rep_frame under the name 'rep_frame' when evaluating
    return grouped_calc, 'rep_frame', rep_frame
Пример #13
0
def test_parens_select_rows():
    """Generated SQL must parenthesize both operands of '&' in a row-selection predicate."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({'x': [1, 2, 3, 4]})
    ops = descr(d=frame).select_rows('x > 1 & x < 4')
    sql = data_algebra.SQLite.SQLiteModel().to_sql(ops)
    # strip all whitespace so the structural check is layout-independent
    compact_sql = re.sub(r'\s+', '', sql)
    assert '("x">1)AND("x"<4)' in compact_sql
    expect = pd.DataFrame({'x': [2, 3]})
    data_algebra.test_util.check_transform(ops, data={"d": frame}, expect=expect)
Пример #14
0
def test_shift_assert_on_0():
    """v.shift(0) is disallowed and must raise ValueError at pipeline-construction time."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({
        'x': [1, 2, 3, 4, 5],
        'v': ['a', 'b', 'c', 'd', 'e'],
    })
    with pytest.raises(ValueError):
        # shift by zero not allowed
        descr(d=frame).extend({'v_s0': 'v.shift(0)'}, order_by=['x'])
Пример #15
0
def test_parens_extend():
    """Generated SQL must parenthesize both operands of '&' in an extend expression."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({'x': [1, 2, 3, 4]})
    ops = descr(d=frame).extend({'y': 'x > 1 & x < 4'})
    sql = data_algebra.SQLite.SQLiteModel().to_sql(
        ops, sql_format_options=SQLFormatOptions(annotate=False))
    # strip all whitespace so the structural check is layout-independent
    compact_sql = re.sub(r'\s+', '', sql)
    assert '("x">1)AND("x"<4)' in compact_sql
    expect = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [False, True, True, False],
    })
    data_algebra.test_util.check_transform(ops, data={"d": frame}, expect=expect)
Пример #16
0
def test_float_divide_needed():
    """Plain '/' disagrees between Pandas and SQLite (databases often use integer arithmetic), motivating %/%."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({'a': [1, 1, 1], 'b': [1, 2, 0]})
    ops = descr(d=frame).extend({'r1': 'a/b'})
    res_pandas = ops.transform(frame)
    handle = data_algebra.SQLite.example_handle()
    handle.insert_table(frame, table_name='d', allow_overwrite=True)
    res_sqlite = handle.read_query(ops)
    handle.close()
    # the two engines intentionally disagree here
    assert not data_algebra.test_util.equivalent_frames(res_sqlite, res_pandas)
Пример #17
0
def test_braid():
    """Braid a state table and an event table into one time-ordered row stream."""
    pd = data_algebra.default_data_model.pd
    # state values known at times 1, 3, 5
    d_state = pd.DataFrame({
        't': [1, 3, 5],
        'state': ['a', 'b', 'c'],
    })
    # events occurring at times 1 and 4
    d_event = pd.DataFrame({
        't': [1, 4],
        'value': [10, 20],
    })
    ops = data_algebra.solutions.braid_data(
        d_state=descr(d_state=d_state),
        d_event=descr(d_event=d_event),
        order_by=['t'],
        partition_by=[],
        state_value_column_name='state',
        event_value_column_names=['value'],
        stand_in_values={'state': '""', 'value': 0.0},
    ).order_rows(['t', 'record_type', 'state'], reverse=['record_type'])
    res = ops.eval({'d_state': d_state, 'd_event': d_event})
    # print(data_algebra.util.pandas_to_example_str(res))
    # expected: interleaved state_row/event_row records; each event row shows
    # the state from the preceding (or same-time) state row
    expect = pd.DataFrame({
        't': [1, 1, 3, 4, 5],
        'state': ['a', 'a', 'b', 'b', 'c'],
        'value': [None, 10, None, 20, None],
        'record_type': ['state_row', 'event_row', 'state_row', 'event_row', 'state_row'],
        })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops,
        data={'d_state': d_state, 'd_event': d_event},
        expect=expect,
        models_to_skip={  # TODO: vet on more dbs
            data_algebra.SparkSQL.SparkSQLModel(),
            data_algebra.MySQL.MySQLModel(),
            data_algebra.PostgreSQL.PostgreSQLModel(),
            data_algebra.BigQuery.BigQueryModel(),
        }
    )
Пример #18
0
def test_dag_elim_bttb():
    """Join of two differently-named table descriptions should generate SQL and transform correctly."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3],
    })
    ops = (descr(d=d).natural_join(
        b=descr(d2=d),
        by=['x'],
        jointype='left',
    ))
    db_model = data_algebra.SQLite.SQLiteModel()
    sql = db_model.to_sql(
        ops,
        sql_format_options=data_algebra.db_model.SQLFormatOptions(
            use_with=False, annotate=False))
    # fix: sql was generated but never checked; confirm we got non-empty query text
    assert isinstance(sql, str)
    assert len(sql) > 0
    expect = pd.DataFrame({
        'x': [1, 2, 3],
    })
    data_algebra.test_util.check_transform(
        ops=ops,
        data={'d': d, 'd2': d},
        expect=expect)
Пример #19
0
def test_sum_one():
    """Summing a column of ones and summing the literal 1 both yield group sizes."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({
        'group': ['a', 'a', 'b', 'b', 'b'],
        'one': [1, 1, 1, 1, 1],
    })
    ops = descr(d=frame).project(
        {
            'sum_one': 'one.sum()',
            'sum_1': '(1).sum()',
        },
        group_by=['group'])
    expect = pd.DataFrame({
        'group': ['a', 'b'],
        'sum_one': [2, 3],
        'sum_1': [2, 3],
    })
    data_algebra.test_util.check_transform(ops=ops, data=frame, expect=expect)
Пример #20
0
def test_db_handle_services_to_csv():
    """Exercise insert/overwrite/drop semantics and the query_to_csv round trip on every test database handle."""
    pd = data_algebra.default_data_model.pd
    handles = data_algebra.test_util.get_test_dbs()
    d = pd.DataFrame({'x': [1.0, 2.0], 'y': ['a', 'b']})
    td = descr(d=d)
    for handle in handles:
        handle.insert_table(d, table_name='d', allow_overwrite=True)
        handle.drop_table('d')
        handle.insert_table(d, table_name='d', allow_overwrite=False)
        # a second non-overwriting insert of an existing table must fail
        with pytest.raises(ValueError):
            handle.insert_table(d, table_name='d', allow_overwrite=False)
        # write query result to CSV and confirm it round-trips
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=True) as tmp:
            handle.query_to_csv(td, res_name=tmp.name)
            round_trip = pd.read_csv(tmp.name)
            assert data_algebra.test_util.equivalent_frames(d, round_trip)
        handle.drop_table('d')
    # clean up
    for handle in handles:
        handle.close()
Пример #21
0
def test_db_handle_servies_bq_desribe():
    """Round-trip a small table through BigQuery and describe it back as a TableDescription."""
    # NOTE(review): function name carries typos ("servies", "desribe"); kept so test selection is unchanged.
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1.0, 2.0], 'y': ['a', 'b']})
    td = descr(d=d)
    if data_algebra.test_util.test_BigQuery:  # only run when BigQuery testing is enabled
        h = data_algebra.BigQuery.example_handle()
        h.insert_table(d, table_name='d', allow_overwrite=True)
        d_back = h.read_query(td)
        # fix: equivalent_frames result was computed but never asserted
        assert data_algebra.test_util.equivalent_frames(d, d_back)
        data_catalog, data_schema = h.db_model.table_prefix.split('.')
        bqd = h.describe_bq_table(
            table_catalog=data_catalog,
            table_schema=data_schema,
            table_name='d',
            row_limit=7
            )
        assert isinstance(bqd, TableDescription)
        assert set(bqd.column_names) == set(td.column_names)
        assert data_algebra.test_util.equivalent_frames(bqd.head, d)
        h.close()
Пример #22
0
def test_any_project_value():
    """x.any_value() in a project picks a representative per-group value; BigQuery renders it as ANY_VALUE."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({
        'x': [.1, .1, .3, .4],
        'g': ['a', 'a', 'b', 'ccc'],
    })
    ops = (
        descr(d=frame)
        .project({'new_column': 'x.any_value()'}, group_by=['g'])
        .order_rows(['g'])
    )
    res = ops.transform(frame)
    expect = pd.DataFrame({
        'g': ['a', 'b', 'ccc'],
        'new_column': [0.1, 0.3, 0.4],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=frame,
        expect=expect,
        valid_for_empty=False)
    # confirm the BigQuery SQL uses its native aggregate
    bq_sql = data_algebra.BigQuery.BigQueryModel().to_sql(ops)
    assert 'ANY_VALUE(`x`)' in bq_sql
Пример #23
0
def test_any_project_logical():
    """any()/all() aggregates over boolean columns covering all four truth-table cases."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({
        'a': [False, False],
        'b': [False, True],
        'c': [True, False],
        'd': [True, True],
    })
    # build the same any_/all_ expression per column
    agg_exprs = {}
    for col in ['a', 'b', 'c', 'd']:
        agg_exprs[f'any_{col}'] = f'{col}.any()'
        agg_exprs[f'all_{col}'] = f'{col}.all()'
    ops = descr(d=frame).project(agg_exprs, group_by=[])
    res = ops.transform(frame)
    expect = pd.DataFrame({
        'any_a': [False],
        'all_a': [False],
        'any_b': [True],
        'all_b': [False],
        'any_c': [True],
        'all_c': [False],
        'any_d': [True],
        'all_d': [True],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=frame,
        expect=expect,
        valid_for_empty=False)
Пример #24
0
def test_rank_1():
    """x.rank() follows the Pandas average-rank tie convention."""
    pd = data_algebra.default_data_model.pd
    frame = pd.DataFrame({'x': [1, 2, 1]})
    ops = descr(d=frame).extend({'x_rank': 'x.rank()'})
    res = ops.transform(frame)
    # tied 1s share rank (1+2)/2 = 1.5
    expect = pd.DataFrame({
        'x': [1, 2, 1],
        'x_rank': [1.5, 3.0, 1.5],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    # databases below rank differently from Pandas, so skip them
    data_algebra.test_util.check_transform(
        ops=ops,
        data=frame,
        expect=expect,
        models_to_skip={
            data_algebra.SQLite.SQLiteModel(),
            data_algebra.PostgreSQL.PostgreSQLModel(),
            data_algebra.BigQuery.BigQueryModel(),
            data_algebra.MySQL.MySQLModel(),
            data_algebra.SparkSQL.SparkSQLModel(),  # cumulative implementation (nice but not same as python)
        })