import sqlalchemy
import testing.postgresql

import pandas as pd

# NOTE: these import paths assume the standalone collate package layout; in
# triage the same objects live under triage.component.collate.
from collate.collate import Aggregate
from collate.imputations import available_imputations
from collate.spacetime import SpacetimeAggregation

# `states_table` (the full set of entity/date pairs) and `imputation_values`
# (per-rule availability and kwargs by column type) are module-level test
# data defined alongside this test.


def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())

        # set up the state table with one row per entity/date pair
        engine.execute('create table states (entity_id int, as_of_date date)')
        for state in states_table:
            engine.execute('insert into states values (%s, %s)', state)

        # build and populate the aggregation table with one max column per feature
        feat_sql = '\n'.join(
            [', prefix_entity_id_1y_%s_max int' % f for f in feat_list])
        engine.execute('''create table prefix_aggregation (
            entity_id int
            , as_of_date date
            %s
        )''' % feat_sql)
        ins_sql = ('insert into prefix_aggregation values (%s, %s' +
                   (', %s' * len(feat_list)) + ')')
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == 'error':
                continue
            for coltype in ['aggregate', 'categorical']:
                # only consider column types this imputation rule supports
                if not imputation_values[imp][coltype]['avail']:
                    continue
                impargs = imputation_values[imp][coltype]['kwargs']
                aggs = [
                    Aggregate(feat, ['max'], {
                        'coltype': coltype,
                        'all': dict(type=imp, **impargs)
                    })
                    for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj='prefix_events',
                    prefix='prefix',
                    groups=['entity_id'],
                    intervals=['1y'],
                    dates=['2016-01-01', '2016-02-03', '2016-03-14'],
                    state_table='states',
                    state_group='entity_id',
                    date_column='as_of_date',
                    input_min_date='2000-01-01',
                    output_date_column='as_of_date')

                conn = engine.connect()
                trans = conn.begin()

                # execute query to find columns with null values and create
                # lists of columns that do and do not need imputation when
                # creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols,
                    nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql('SELECT * FROM prefix_aggregation_imputed', engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert 'prefix_entity_id_1y_%s_max' % feat in df.columns.values
                    assert df['prefix_entity_id_1y_%s_max' % feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" flag column and
                    # leave it non-null (categoricals are expected to be handled
                    # through the null category); zero_noflag imputation should not
                    # generate a flag either
                    if (feat in exp_imp_cols
                            and coltype != 'categorical'
                            and imp != 'zero_noflag'):
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat in df.columns.values
                        assert df['prefix_entity_id_1y_%s_max_imp' % feat].isnull().sum() == 0
                    else:
                        # should not generate an imputed column when not needed
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat not in df.columns.values
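
# Illustrative sketch (an assumption, not taken from the original suite) of
# how the `feat_list`, `exp_imp_cols`, and `feat_table` fixtures might be
# parametrized. `feat_table` rows line up with the columns of
# prefix_aggregation (entity_id, as_of_date, then one value per feature),
# and a null value is what puts a feature into `exp_imp_cols`:
#
#     import pytest
#
#     @pytest.mark.parametrize('feat_list, exp_imp_cols, feat_table', [(
#         ['f1', 'f2'],                  # aggregated as ..._f1_max, ..._f2_max
#         ['f2'],                        # features with nulls, so flags expected
#         [(1, '2016-01-01', 3, 5),
#          (1, '2016-02-03', 1, None),   # null f2 -> needs imputation
#          (1, '2016-03-14', 2, 0)],
#     )])
#     def test_imputation_output(feat_list, exp_imp_cols, feat_table):
#         ...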
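
# `imputation_values` is consulted as imputation_values[imp][coltype]['avail']
# and imputation_values[imp][coltype]['kwargs']; a plausible shape (assumed,
# the real module-level data may differ) would be:
#
#     imputation_values = {
#         'mean': {
#             'aggregate': {'avail': True, 'kwargs': {}},
#             'categorical': {'avail': False, 'kwargs': {}},
#         },
#         'constant': {
#             'aggregate': {'avail': True, 'kwargs': {'value': 0}},
#             'categorical': {'avail': True, 'kwargs': {'value': 'unknown'}},
#         },
#         # ... one entry per rule in available_imputations
#     }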