def test_dag_elim():
    """Shared sub-pipelines over the same table should be de-duplicated in generated SQL."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3]})
    # Both join arms apply the identical extend to the same table description.
    ops = (
        descr(d=d)
        .extend({'y': 'x + 1'})
        .natural_join(
            b=descr(d=d).extend({'y': 'x + 1'}),
            by=['x'],
            jointype='left',
        )
    )
    db_model = data_algebra.SQLite.SQLiteModel()
    sql_text = db_model.to_sql(
        ops,
        sql_format_options=data_algebra.db_model.SQLFormatOptions(
            use_with=True, annotate=False),
    )
    # Count how many times the source table is named in the SQL.
    table_mentions = sql_text.count('"d"')
    assert table_mentions > 0
    assert table_mentions <= 2
    # assert table_mentions == 1  # show table is referenced exactly once
    expect = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [2, 3, 4],
    })
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
def test_dag_elim_btt():
    """Self-join of a bare table with itself should be rejected by SQL generation."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3]})
    ops = (
        descr(d=d)
        .natural_join(
            b=descr(d=d),
            by=['x'],
            jointype='left',
        )
    )
    sqlite_model = data_algebra.SQLite.SQLiteModel()
    with pytest.raises(ValueError):
        sqlite_model.to_sql(ops)
def test_dag_elim_ute():
    """concat_rows of a table with an extended copy of itself works and tags sources."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [1, 2, 3],
    })
    ops = descr(d=d).concat_rows(b=descr(d=d).extend({'y': 'x'}))
    # source_name marks which arm of the concat each row came from.
    expect = pd.DataFrame({
        'x': [1, 2, 3, 1, 2, 3],
        'y': [1, 2, 3, 1, 2, 3],
        'source_name': ['a', 'a', 'a', 'b', 'b', 'b'],
    })
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
def test_dag_elim_bte():
    """Left-join a table to an extended copy of itself."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3]})
    ops = (
        descr(d=d)
        .natural_join(
            b=descr(d=d).extend({'y': 'x'}),
            by=['x'],
            jointype='left',
        )
    )
    expect = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [1, 2, 3],
    })
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
def test_dag_elim_bttf():
    """Work-around for the table-self-join restriction: add a throwaway column."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3]})
    ops = (
        descr(d=d)
        .natural_join(
            b=descr(d=d).extend({'waste_column': 1}),
            by=['x'],
            jointype='left',
        )
        # can't drop waste_column here, or the optimizer removes the extend
    )
    expect = pd.DataFrame({'x': [1, 2, 3], 'waste_column': 1})
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
def test_shift():
    """Window shift()/lag/lead with positive, negative, and default offsets."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3, 4, 5],
        'v': ['a', 'b', 'c', 'd', 'e'],
    })
    ops = descr(d=d).extend(
        {
            'v_m2': 'v.shift(-2)',
            'v_m1': 'v.shift(-1)',
            'v_s': 'v.shift()',    # default shift is +1
            'v_p1': 'v.shift(1)',
            'v_p2': 'v.shift(2)',
        },
        order_by=['x'],
    )
    local_result = ops.transform(d)
    expect = pd.DataFrame({
        'x': [1, 2, 3, 4, 5],
        'v': ['a', 'b', 'c', 'd', 'e'],
        'v_m2': ['c', 'd', 'e', None, None],
        'v_m1': ['b', 'c', 'd', 'e', None],
        'v_s': [None, 'a', 'b', 'c', 'd'],
        'v_p1': [None, 'a', 'b', 'c', 'd'],
        'v_p2': [None, None, 'a', 'b', 'c'],
    })
    assert data_algebra.test_util.equivalent_frames(local_result, expect)
    data_algebra.test_util.check_transform(ops, data={"d": d}, expect=expect)
def test_first_last_extend():
    """first()/last() as windowed extends, partitioned and ordered."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': ['a', 'b', 'c', 'd'],
        'g': ['o', 'o', 't', 't'],
    })
    ops = descr(d=d).extend(
        {
            'v_first': 'v.first()',
            'v_last': 'v.last()',
        },
        partition_by=['g'],
        order_by=['x'],
    )
    local_result = ops.transform(d)
    expect = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'v': ['a', 'b', 'c', 'd'],
        'g': ['o', 'o', 't', 't'],
        'v_first': ['a', 'a', 'c', 'c'],
        'v_last': ['b', 'b', 'd', 'd'],
    })
    assert data_algebra.test_util.equivalent_frames(local_result, expect)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=d,
        expect=expect,
        models_to_skip={
            data_algebra.SQLite.SQLiteModel(),
            data_algebra.PostgreSQL.PostgreSQLModel(),
            data_algebra.BigQuery.BigQueryModel(),
            data_algebra.MySQL.MySQLModel(),
            # cumulative implementation (nice but not same as python)
            data_algebra.SparkSQL.SparkSQLModel(),
        })
def test_float_divide_works():
    """%/% float division, with bad results (inf/nan) mapped to None."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'a': [1, 1, 1, 0],
        'b': [1, 2, 0, 0],
    })
    ops = (
        descr(d=d)
        .extend({'r': 'a %/% b'})
        # sqlite returns nan for 1/0, not +inf; normalize bad values to None
        .extend({'r': 'r.is_bad().where(None, r)'})
    )
    res_pandas = ops.transform(d)
    expect = pd.DataFrame({
        'a': [1, 1, 1, 0],
        'b': [1, 2, 0, 0],
        'r': [1.0, 0.5, None, None],
    })
    assert data_algebra.test_util.equivalent_frames(expect, res_pandas)
    # Confirm the same result through an actual SQLite round trip.
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    assert data_algebra.test_util.equivalent_frames(expect, res_sqlite)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=d,
        expect=expect,
        models_to_skip={
            data_algebra.MySQL.MySQLModel(),  # sqlalchemy won't insert inf
            data_algebra.SparkSQL.SparkSQLModel(),  # probably not inserting values
        })
def test_is_inf():
    """is_inf/is_nan/is_bad/is_null predicates over special float values."""
    pd = data_algebra.default_data_model.pd
    special_values = [1.0, numpy.inf, numpy.nan, None, 0.0, -1.0, -numpy.inf]
    d = pd.DataFrame({'a': special_values})
    ops = descr(d=d).extend({
        'is_inf': 'a.is_inf().if_else(1, 0)',
        'is_nan': 'a.is_nan().if_else(1, 0)',
        'is_bad': 'a.is_bad().if_else(1, 0)',
        'is_null': 'a.is_null().if_else(1, 0)',
    })
    res_pandas = ops.transform(d)
    expect = pd.DataFrame({
        'a': special_values,
        'is_inf': [0, 1, 0, 0, 0, 0, 1],
        'is_nan': [0, 0, 1, 1, 0, 0, 0],
        'is_bad': [0, 1, 1, 1, 0, 0, 1],
        'is_null': [0, 0, 1, 1, 0, 0, 0],
    })
    assert data_algebra.test_util.equivalent_frames(expect, res_pandas)
    # Confirm the same result through an actual SQLite round trip.
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    assert data_algebra.test_util.equivalent_frames(expect, res_sqlite)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=d,
        expect=expect,
        models_to_skip={
            data_algebra.MySQL.MySQLModel(),  # sqlalchemy won't insert inf
            data_algebra.SparkSQL.SparkSQLModel(),  # probably not inserting values
        })
def test_if_else_return_type():
    """where()/if_else() over a nullable boolean must yield float columns."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [True, False, None],
    })
    ops = (
        descr(d=d)
        .extend({
            'w': 'x.where(1.0, 2.0)',
            'i': 'x.if_else(1.0, 2.0)',
        })
    )
    res = ops.transform(d)
    expect = pd.DataFrame({
        'x': [True, False, None],
        'w': [1.0, 2.0, 2.0],
        'i': [1.0, 2.0, numpy.nan],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    assert str(res['w'].dtype) == 'float64'
    assert str(res['i'].dtype) == 'float64'
    # when column types are wrong this threw in pyvtreat test_KDD2009.py
    numpy.isnan(res.loc[:, ['w', 'i']])
    # Repeat the checks via an actual SQLite round trip.
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    assert data_algebra.test_util.equivalent_frames(res_sqlite, expect)
    assert str(res_sqlite['w'].dtype) == 'float64'
    assert str(res_sqlite['i'].dtype) == 'float64'
    # when column types are wrong this threw in pyvtreat test_KDD2009.py
    numpy.isnan(res_sqlite.loc[:, ['w', 'i']])
def test_dag_elim_utt():
    """concat_rows of a table with itself: SQL generation and transform both work."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3],
        'y': [1, 2, 3],
    })
    ops = descr(d=d).concat_rows(b=descr(d=d))
    # Exercise SQL generation (result not inspected, just must not raise).
    db_model = data_algebra.SQLite.SQLiteModel()
    sql = db_model.to_sql(
        ops,
        sql_format_options=data_algebra.db_model.SQLFormatOptions(
            use_with=True, annotate=False),
    )
    expect = pd.DataFrame({
        'x': [1, 2, 3, 1, 2, 3],
        'y': [1, 2, 3, 1, 2, 3],
        'source_name': ['a', 'a', 'a', 'b', 'b', 'b'],
    })
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
def xicor_score_variables_plan(
    d: ViewRepresentation,
    *,
    x_vars: Iterable[str],
    y_name: str,
    n_rep: int = 25,
):
    """
    Set up a query to batch compute xicor.

    :param d: description of incoming data frame
    :param x_vars: list of explanatory variable names
    :param y_name: name of dependent variable
    :param n_rep: number of times to repeat calculation
    :return: group_calc_ops, rep_frame_name, rep_frame
    """
    # Validate the variable specification before building any operators.
    assert not isinstance(x_vars, str)
    x_vars = list(x_vars)
    assert len(x_vars) > 0
    assert numpy.all([isinstance(c, str) for c in x_vars])
    assert len(x_vars) == len(set(x_vars))
    assert isinstance(y_name, str)
    assert y_name not in x_vars
    d_col_set = set(d.column_names)
    assert y_name in d_col_set
    assert numpy.all([c in d_col_set for c in x_vars])
    assert isinstance(n_rep, int)
    # Record map: one block row per explanatory variable, pairing it with y.
    record_map = RecordMap(
        blocks_out=RecordSpecification(
            control_table=data_algebra.pandas_model.pd.DataFrame({
                'variable_name': x_vars,
                'x': x_vars,
                'y': y_name,
            }),
            record_keys=[],
            control_table_keys=['variable_name'],
        )
    )
    # One row per requested repetition; cross-joined in to replicate the data.
    rep_frame = data_algebra.default_data_model.pd.DataFrame({'rep': range(n_rep)})
    grouped_calc = (
        xicor_query(
            d
            .convert_records(record_map)
            .natural_join(  # cross join rows to get experiment repetitions
                b=descr(rep_frame=rep_frame),
                by=[],
                jointype='cross',
            ),
            var_keys=['variable_name', 'rep'])
        # Summarize the replicated scores per variable.
        .project({
            'xicor_mean': 'xicor.mean()',
            'xicor_std': 'xicor.std()',
        }, group_by=['variable_name'])
        .order_rows(['variable_name'])
    )
    return grouped_calc, 'rep_frame', rep_frame
def test_parens_select_rows():
    """select_rows must parenthesize the operands of & in generated SQL."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3, 4]})
    ops = descr(d=d).select_rows('x > 1 & x < 4')
    sql = data_algebra.SQLite.SQLiteModel().to_sql(ops)
    # Strip all whitespace so we can match the structure directly.
    smushed = re.sub(r'\s+', '', sql)
    assert '("x">1)AND("x"<4)' in smushed
    expect = pd.DataFrame({
        'x': [2, 3],
    })
    data_algebra.test_util.check_transform(ops, data={"d": d}, expect=expect)
def test_shift_assert_on_0():
    """shift(0) is meaningless and must raise at pipeline construction time."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 3, 4, 5],
        'v': ['a', 'b', 'c', 'd', 'e'],
    })
    with pytest.raises(ValueError):
        # shift by zero not allowed
        descr(d=d).extend(
            {'v_s0': 'v.shift(0)'},
            order_by=['x'],
        )
def test_parens_extend():
    """extend must parenthesize the operands of & in generated SQL."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3, 4]})
    ops = descr(d=d).extend({'y': 'x > 1 & x < 4'})
    sql = data_algebra.SQLite.SQLiteModel().to_sql(
        ops,
        sql_format_options=SQLFormatOptions(annotate=False))
    # Strip all whitespace so we can match the structure directly.
    smushed = re.sub(r'\s+', '', sql)
    assert '("x">1)AND("x"<4)' in smushed
    expect = pd.DataFrame({
        'x': [1, 2, 3, 4],
        'y': [False, True, True, False],
    })
    data_algebra.test_util.check_transform(ops, data={"d": d}, expect=expect)
def test_float_divide_needed():
    # need %/% as databases often use integer arithmetic
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'a': [1, 1, 1], 'b': [1, 2, 0]})
    ops = descr(d=d).extend({
        'r1': 'a/b',
    })
    res_pandas = ops.transform(d)
    sqlite_handle = data_algebra.SQLite.example_handle()
    sqlite_handle.insert_table(d, table_name='d', allow_overwrite=True)
    res_sqlite = sqlite_handle.read_query(ops)
    sqlite_handle.close()
    # Integer division in SQLite diverges from Pandas float division,
    # demonstrating why the %/% operator exists.
    assert not data_algebra.test_util.equivalent_frames(res_sqlite, res_pandas)
def test_braid():
    """braid_data: interleave state rows and event rows on a shared time axis."""
    pd = data_algebra.default_data_model.pd
    d_state = pd.DataFrame({
        't': [1, 3, 5],
        'state': ['a', 'b', 'c'],
    })
    d_event = pd.DataFrame({
        't': [1, 4],
        'value': [10, 20],
    })
    ops = data_algebra.solutions.braid_data(
        d_state=descr(d_state=d_state),
        d_event=descr(d_event=d_event),
        order_by=['t'],
        partition_by=[],
        state_value_column_name='state',
        event_value_column_names=['value'],
        stand_in_values={'state': '""', 'value': 0.0},
    ).order_rows(['t', 'record_type', 'state'], reverse=['record_type'])
    res = ops.eval({'d_state': d_state, 'd_event': d_event})
    # print(data_algebra.util.pandas_to_example_str(res))
    expect = pd.DataFrame({
        't': [1, 1, 3, 4, 5],
        'state': ['a', 'a', 'b', 'b', 'c'],
        'value': [None, 10, None, 20, None],
        'record_type': ['state_row', 'event_row', 'state_row', 'event_row', 'state_row'],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops,
        data={'d_state': d_state, 'd_event': d_event},
        expect=expect,
        models_to_skip={
            # TODO: vet on more dbs
            data_algebra.SparkSQL.SparkSQLModel(),
            data_algebra.MySQL.MySQLModel(),
            data_algebra.PostgreSQL.PostgreSQLModel(),
            data_algebra.BigQuery.BigQueryModel(),
        })
def test_dag_elim_bttb():
    """Join of two distinct tables with identical contents is allowed."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1, 2, 3]})
    ops = (
        descr(d=d)
        .natural_join(
            b=descr(d2=d),  # same data, different table name
            by=['x'],
            jointype='left',
        )
    )
    # Exercise non-WITH SQL generation (result not inspected, just must not raise).
    db_model = data_algebra.SQLite.SQLiteModel()
    sql = db_model.to_sql(
        ops,
        sql_format_options=data_algebra.db_model.SQLFormatOptions(
            use_with=False, annotate=False),
    )
    expect = pd.DataFrame({
        'x': [1, 2, 3],
    })
    data_algebra.test_util.check_transform(
        ops=ops,
        data={'d': d, 'd2': d},
        expect=expect)
def test_sum_one():
    """Summing a column of ones and summing the literal 1 both count group sizes."""
    d = data_algebra.default_data_model.pd.DataFrame({
        'group': ['a', 'a', 'b', 'b', 'b'],
        'one': [1, 1, 1, 1, 1],
    })
    ops = descr(d=d).project(
        {
            'sum_one': 'one.sum()',
            'sum_1': '(1).sum()',
        },
        group_by=['group'],
    )
    expect = data_algebra.default_data_model.pd.DataFrame({
        'group': ['a', 'b'],
        'sum_one': [2, 3],
        'sum_1': [2, 3],
    })
    data_algebra.test_util.check_transform(ops=ops, data=d, expect=expect)
def test_db_handle_services_to_csv():
    """Insert/drop/overwrite-protection and query_to_csv round trip on all test DBs."""
    pd = data_algebra.default_data_model.pd
    db_handles = data_algebra.test_util.get_test_dbs()
    d = pd.DataFrame({'x': [1.0, 2.0], 'y': ['a', 'b']})
    td = descr(d=d)
    for h in db_handles:
        h.insert_table(d, table_name='d', allow_overwrite=True)
        h.drop_table('d')
        h.insert_table(d, table_name='d', allow_overwrite=False)
        # Re-inserting without allow_overwrite must fail.
        with pytest.raises(ValueError):
            h.insert_table(d, table_name='d', allow_overwrite=False)
        # Round trip through a CSV file.
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=True) as tmp:
            h.query_to_csv(td, res_name=tmp.name)
            d_back = pd.read_csv(tmp.name)
        assert data_algebra.test_util.equivalent_frames(d, d_back)
        h.drop_table('d')  # clean up
    for h in db_handles:
        h.close()
def test_db_handle_servies_bq_desribe():
    """BigQuery describe_bq_table returns a matching TableDescription (only if configured)."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({'x': [1.0, 2.0], 'y': ['a', 'b']})
    td = descr(d=d)
    # Skip body entirely unless BigQuery testing is enabled.
    if data_algebra.test_util.test_BigQuery:
        h = data_algebra.BigQuery.example_handle()
        tdb = h.insert_table(d, table_name='d', allow_overwrite=True)
        d_back = h.read_query(td)
        data_algebra.test_util.equivalent_frames(d, d_back)
        data_catalog, data_schema = h.db_model.table_prefix.split('.')
        bqd = h.describe_bq_table(
            table_catalog=data_catalog,
            table_schema=data_schema,
            table_name='d',
            row_limit=7,
        )
        assert isinstance(bqd, TableDescription)
        assert set(bqd.column_names) == set(td.column_names)
        assert data_algebra.test_util.equivalent_frames(bqd.head, d)
        h.close()
def test_any_project_value():
    """any_value() in a project picks one value per group; BigQuery uses ANY_VALUE."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [.1, .1, .3, .4],
        'g': ['a', 'a', 'b', 'ccc'],
    })
    ops = (
        descr(d=d)
        .project({'new_column': 'x.any_value()'}, group_by=['g'])
        .order_rows(['g'])
    )
    res = ops.transform(d)
    expect = pd.DataFrame({
        'g': ['a', 'b', 'ccc'],
        'new_column': [0.1, 0.3, 0.4],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops, data=d, expect=expect, valid_for_empty=False)
    # Confirm the intended BigQuery aggregate is emitted.
    bq_sql = data_algebra.BigQuery.BigQueryModel().to_sql(ops)
    assert 'ANY_VALUE(`x`)' in bq_sql
def test_any_project_logical():
    """any()/all() aggregates over all four two-row boolean combinations."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'a': [False, False],
        'b': [False, True],
        'c': [True, False],
        'd': [True, True],
    })
    ops = descr(d=d).project(
        {
            'any_a': 'a.any()',
            'all_a': 'a.all()',
            'any_b': 'b.any()',
            'all_b': 'b.all()',
            'any_c': 'c.any()',
            'all_c': 'c.all()',
            'any_d': 'd.any()',
            'all_d': 'd.all()',
        },
        group_by=[],
    )
    res = ops.transform(d)
    expect = pd.DataFrame({
        'any_a': [False],
        'all_a': [False],
        'any_b': [True],
        'all_b': [False],
        'any_c': [True],
        'all_c': [False],
        'any_d': [True],
        'all_d': [True],
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops, data=d, expect=expect, valid_for_empty=False)
def test_rank_1():
    """rank() assigns averaged ranks to ties (Pandas semantics)."""
    pd = data_algebra.default_data_model.pd
    d = pd.DataFrame({
        'x': [1, 2, 1],
    })
    ops = descr(d=d).extend({'x_rank': 'x.rank()'})
    res = ops.transform(d)
    expect = pd.DataFrame({
        'x': [1, 2, 1],
        'x_rank': [1.5, 3.0, 1.5],  # tied values share the average rank
    })
    assert data_algebra.test_util.equivalent_frames(res, expect)
    data_algebra.test_util.check_transform(
        ops=ops,
        data=d,
        expect=expect,
        models_to_skip={
            data_algebra.SQLite.SQLiteModel(),
            data_algebra.PostgreSQL.PostgreSQLModel(),
            data_algebra.BigQuery.BigQueryModel(),
            data_algebra.MySQL.MySQLModel(),
            # cumulative implementation (nice but not same as python)
            data_algebra.SparkSQL.SparkSQLModel(),
        })