def test_index_column_lookup(test_engine):
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id", "zip_code"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]
    features_schema_name = "features"

    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    )
    lookup = feature_generator.index_column_lookup(aggregations)
    assert lookup == {
        "prefix1_aggregation_imputed": ["as_of_date", "entity_id"],
        "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"],
    }
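# A minimal sketch, under assumption, of the `test_engine` fixture that
# test_index_column_lookup (above) and test_generate_table_tasks (below)
# receive; the real fixture presumably lives in a shared conftest.py. This
# version simply hands each test a throwaway Postgres engine, mirroring the
# inline testing.postgresql setup used by the older tests further down.
import pytest
import sqlalchemy
import testing.postgresql


@pytest.fixture
def test_engine():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        yield engine
        engine.dispose()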
def _aggregation(self, aggregation_config, feature_dates, state_table):
    logging.info(
        "Building collate.SpacetimeAggregation for config %s and %s as_of_dates",
        aggregation_config,
        len(feature_dates),
    )

    # read top-level imputation rules from the aggregation config; we'll allow
    # these to be overridden by imputation rules at the individual feature
    # level as those get parsed as well
    agimp = aggregation_config.get("aggregates_imputation", {})
    catimp = aggregation_config.get("categoricals_imputation", {})
    arrcatimp = aggregation_config.get("array_categoricals_imputation", {})

    aggregates = [
        Aggregate(
            aggregate["quantity"],
            aggregate["metrics"],
            dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})),
            coltype=aggregate.get("coltype", None),
        )
        for aggregate in aggregation_config.get("aggregates", [])
    ]
    logging.info("Found %s quantity aggregates", len(aggregates))

    categoricals = self._build_categoricals(
        aggregation_config.get("categoricals", []), catimp
    )
    logging.info("Found %s categorical aggregates", len(categoricals))

    array_categoricals = self._build_array_categoricals(
        aggregation_config.get("array_categoricals", []), arrcatimp
    )
    logging.info("Found %s array categorical aggregates", len(array_categoricals))

    return SpacetimeAggregation(
        aggregates + categoricals + array_categoricals,
        from_obj=aggregation_config["from_obj"],
        intervals=aggregation_config["intervals"],
        groups=aggregation_config["groups"],
        dates=feature_dates,
        state_table=state_table,
        state_group=self.entity_id_column,
        date_column=aggregation_config["knowledge_date_column"],
        output_date_column="as_of_date",
        input_min_date=self.feature_start_time,
        schema=self.features_schema_name,
        prefix=aggregation_config["prefix"],
    )
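# For reference, an illustrative `aggregation_config` covering the keys
# _aggregation reads above. The specific names used here ("events",
# "semantic.events", "knowledge_date") are placeholders for this sketch, not
# values taken from any real project config:
example_aggregation_config = {
    "prefix": "events",
    "from_obj": "semantic.events",
    "knowledge_date_column": "knowledge_date",
    "intervals": ["1y", "all"],
    "groups": ["entity_id"],
    # top-level imputation defaults; overridable per feature via "imputation"
    "aggregates_imputation": {"all": {"type": "zero"}},
    "aggregates": [
        {
            "quantity": "outcome::int",
            "metrics": ["sum", "avg"],
            # per-feature override merged over the top-level default
            "imputation": {"avg": {"type": "mean"}},
        }
    ],
    "categoricals_imputation": {"all": {"type": "zero"}},
    "categoricals": [],
    "array_categoricals": [],
}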
def _aggregation(self, aggregation_config, feature_dates, state_table):
    logging.info(
        'Building collate.SpacetimeAggregation for config %s and as_of_dates %s',
        aggregation_config,
        feature_dates)

    # read top-level imputation rules from the aggregation config; we'll allow
    # these to be overridden by imputation rules at the individual feature
    # level as those get parsed as well
    agimp = aggregation_config.get('aggregates_imputation', {})
    catimp = aggregation_config.get('categoricals_imputation', {})
    arrcatimp = aggregation_config.get('array_categoricals_imputation', {})

    aggregates = [
        Aggregate(
            aggregate['quantity'],
            aggregate['metrics'],
            dict(agimp, coltype='aggregate', **aggregate.get('imputation', {})))
        for aggregate in aggregation_config.get('aggregates', [])
    ]
    logging.info('Found %s quantity aggregates', len(aggregates))

    categoricals = self._build_categoricals(
        aggregation_config.get('categoricals', []),
        catimp)
    logging.info('Found %s categorical aggregates', len(categoricals))

    array_categoricals = self._build_array_categoricals(
        aggregation_config.get('array_categoricals', []),
        arrcatimp)
    logging.info('Found %s array categorical aggregates', len(array_categoricals))

    return SpacetimeAggregation(
        aggregates + categoricals + array_categoricals,
        from_obj=aggregation_config['from_obj'],
        intervals=aggregation_config['intervals'],
        groups=aggregation_config['groups'],
        dates=feature_dates,
        state_table=state_table,
        state_group=self.entity_id_column,
        date_column=aggregation_config['knowledge_date_column'],
        output_date_column='as_of_date',
        input_min_date=self.feature_start_time,
        schema=self.features_schema_name,
        prefix=aggregation_config['prefix'])
def test_generate_table_tasks(test_engine):
    test_engine.execute("create schema features")
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]
    features_schema_name = "features"

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="aggregation")
    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)

    # build the aggregation tables to check the imputation tasks
    FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).process_table_tasks(table_tasks)

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="imputation")
    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)
def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute('create table states (entity_id int, as_of_date date)')
        for state in states_table:
            engine.execute('insert into states values (%s, %s)', state)

        feat_sql = '\n'.join(
            [', prefix_entity_id_1y_%s_max int' % f for f in feat_list])
        engine.execute('''create table prefix_aggregation (
            entity_id int
            , as_of_date date
            %s
            )''' % feat_sql)
        ins_sql = ('insert into prefix_aggregation values (%s, %s' +
                   (', %s' * len(feat_list)) + ')')
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == 'error':
                continue
            for coltype in ['aggregate', 'categorical']:
                # only consider column types for which this imputation rule
                # is available
                if not imputation_values[imp][coltype]['avail']:
                    continue
                impargs = imputation_values[imp][coltype]['kwargs']
                aggs = [
                    Aggregate(feat, ['max'], {
                        'coltype': coltype,
                        'all': dict(type=imp, **impargs)
                    })
                    for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj='prefix_events',
                    prefix='prefix',
                    groups=['entity_id'],
                    intervals=['1y'],
                    dates=['2016-01-01', '2016-02-03', '2016-03-14'],
                    state_table='states',
                    state_group='entity_id',
                    date_column='as_of_date',
                    input_min_date='2000-01-01',
                    output_date_column='as_of_date')

                conn = engine.connect()
                trans = conn.begin()

                # execute query to find columns with null values and create
                # lists of columns that do and do not need imputation when
                # creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols,
                    nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql('SELECT * FROM prefix_aggregation_imputed', engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert 'prefix_entity_id_1y_%s_max' % feat in df.columns.values
                    assert df['prefix_entity_id_1y_%s_max' % feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" column and be non-null
                    # (categoricals are expected to be handled through the null category)
                    # zero_noflag imputation should not generate a flag either
                    if feat in exp_imp_cols and coltype != 'categorical' and imp != 'zero_noflag':
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat in df.columns.values
                        assert df['prefix_entity_id_1y_%s_max_imp' % feat].isnull().sum() == 0
                    else:
                        # should not generate an imputed column when not needed
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat not in df.columns.values
def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in states_table:
            engine.execute("insert into states values (%s, %s)", state)

        feat_sql = "\n".join(
            [", prefix_entity_id_1y_%s_max int" % f for f in feat_list])
        engine.execute("""create table prefix_aggregation (
            entity_id int
            , as_of_date date
            %s
            )""" % feat_sql)
        ins_sql = ("insert into prefix_aggregation values (%s, %s" +
                   (", %s" * len(feat_list)) + ")")
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == "error":
                continue
            for coltype in ["aggregate", "categorical"]:
                # only consider column types for which this imputation rule
                # is available
                if not imputation_values[imp][coltype]["avail"]:
                    continue
                impargs = imputation_values[imp][coltype]["kwargs"]
                aggs = [
                    Aggregate(
                        feat,
                        ["max"],
                        {"coltype": coltype, "all": dict(type=imp, **impargs)},
                    )
                    for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj="prefix_events",
                    prefix="prefix",
                    groups=["entity_id"],
                    intervals=["1y"],
                    dates=["2016-01-01", "2016-02-03", "2016-03-14"],
                    state_table="states",
                    state_group="entity_id",
                    date_column="as_of_date",
                    input_min_date="2000-01-01",
                    output_date_column="as_of_date",
                )

                conn = engine.connect()
                trans = conn.begin()

                # execute query to find columns with null values and create
                # lists of columns that do and do not need imputation when
                # creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols,
                    nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql("SELECT * FROM prefix_aggregation_imputed", engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert "prefix_entity_id_1y_%s_max" % feat in df.columns.values
                    assert df["prefix_entity_id_1y_%s_max" % feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" column and be non-null
                    # (categoricals are expected to be handled through the null category)
                    # zero_noflag imputation should not generate a flag either
                    if (feat in exp_imp_cols
                            and coltype != "categorical"
                            and imp != "zero_noflag"):
                        assert "prefix_entity_id_1y_%s_max_imp" % feat in df.columns.values
                        assert df["prefix_entity_id_1y_%s_max_imp" % feat].isnull().sum() == 0
                    else:
                        # should not generate an imputed column when not needed
                        assert "prefix_entity_id_1y_%s_max_imp" % feat not in df.columns.values
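# Shape-only illustration of the parametrized inputs the two
# test_imputation_output variants above receive (presumably via
# pytest.mark.parametrize). The placeholder values below are assumptions made
# to document the shape; the real cases live with the test module:
#   feat_list    -- feature names, one "prefix_entity_id_1y_<feat>_max" column each
#   exp_imp_cols -- the subset of features expected to need an *_imp flag column
#   feat_table   -- rows of (entity_id, as_of_date, <one value per feature>),
#                   with None wherever imputation should kick in
EXAMPLE_FEAT_LIST = ["f1", "f2"]
EXAMPLE_EXP_IMP_COLS = ["f1"]
EXAMPLE_FEAT_TABLE = [
    (1, "2016-01-01", 0.5, 1.0),   # fully observed row
    (2, "2016-01-01", None, 2.0),  # f1 is null here, so it needs imputation
]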
def test_basic_spacetime():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            "create table events (entity_id int, event_date date, outcome bool)"
        )
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "2y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column="event_date",
            output_date_column="as_of_date",
        )

        st.execute(engine.connect())

        r = engine.execute(
            "select * from events_entity_id order by entity_id, as_of_date"
        )
        rows = [x for x in r]

        assert rows[0]["entity_id"] == 1
        assert rows[0]["as_of_date"] == date(2015, 1, 1)
        assert rows[0]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[1]["entity_id"] == 1
        assert rows[1]["as_of_date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[1]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_2y_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[2]["entity_id"] == 2
        assert rows[2]["as_of_date"] == date(2015, 1, 1)
        assert rows[2]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[2]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[2]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[3]["entity_id"] == 2
        assert rows[3]["as_of_date"] == date(2016, 1, 1)
        assert rows[3]["events_entity_id_1y_outcome::int_sum"] is None
        assert rows[3]["events_entity_id_1y_outcome::int_avg"] is None
        assert rows[3]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[3]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[4]["entity_id"] == 3
        assert rows[4]["as_of_date"] == date(2015, 1, 1)
        assert rows[4]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_avg"] == 0

        assert rows[5]["entity_id"] == 3
        assert rows[5]["as_of_date"] == date(2016, 1, 1)
        assert rows[5]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[5]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_2y_outcome::int_avg"] == 0.25
        assert rows[5]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_all_outcome::int_avg"] == 0.25

        assert rows[6]["entity_id"] == 4
        # rows[6]['date'] == date(2015, 1, 1) is skipped due to no data!
        assert rows[6]["as_of_date"] == date(2016, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_avg"] == 0

        assert len(rows) == 7

        # check some imputation results
        r = engine.execute(
            "select * from events_aggregation_imputed order by entity_id, as_of_date"
        )
        rows = [x for x in r]

        assert rows[6]["entity_id"] == 4
        assert rows[6]["as_of_date"] == date(2015, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_1y_outcome::int_sum_imp"] == 1
        assert round(float(rows[6]["events_entity_id_1y_outcome::int_avg"]), 4) == 0.1667
        assert rows[6]["events_entity_id_1y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_2y_outcome::int_sum_imp"] == 1
        assert round(float(rows[6]["events_entity_id_2y_outcome::int_avg"]), 4) == 0.3333
        assert rows[6]["events_entity_id_2y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_all_outcome::int_sum_imp"] == 1
        assert round(float(rows[6]["events_entity_id_all_outcome::int_avg"]), 4) == 0.3333
        assert rows[6]["events_entity_id_all_outcome::int_avg_imp"] == 1

        assert rows[7]["entity_id"] == 4
        assert rows[7]["as_of_date"] == date(2016, 1, 1)
        assert rows[7]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg_imp"] == 0

        assert len(rows) == 8
def test_input_min_date():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute("create table events (entity_id int, date date, outcome bool)")
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["all"],
            dates=["2016-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column='"date"',
            input_min_date="2015-11-10",
        )

        st.execute(engine.connect())

        r = engine.execute("select * from events_entity_id order by entity_id")
        rows = [x for x in r]

        assert rows[0]["entity_id"] == 1
        assert rows[0]["date"] == date(2016, 1, 1)
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 1
        assert rows[1]["entity_id"] == 4
        assert rows[1]["date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0

        assert len(rows) == 2

        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column='"date"',
            input_min_date="2014-11-10",
        )
        with pytest.raises(ValueError):
            st.validate(engine.connect())
        with pytest.raises(ValueError):
            st.execute(engine.connect())
def test_join_with_cohort_table(db_engine):
    # if we specify joining with the cohort table
    # only entity_id/date pairs in the cohort table should show up
    db_engine.execute("create table events (entity_id int, date date, outcome bool)")
    for event in events_data:
        db_engine.execute("insert into events values (%s, %s, %s::bool)", event)

    db_engine.execute("create table cohort (entity_id int, date date)")

    # use the states list from above except only include entities 1 and 2 in the cohort
    smaller_cohort = sorted(
        product(
            set([l[0] for l in events_data if l[0] == 1 or l[0] == 2]),
            set([l[1] for l in events_data] + [date(2016, 1, 1)]),
        )
    )
    for state in smaller_cohort:
        db_engine.execute("insert into cohort values (%s, %s)", state)

    # create our test aggregation with the important 'join_with_cohort_table' flag
    agg = Aggregate(
        "outcome::int",
        ["sum", "avg"],
        {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        },
    )
    st = SpacetimeAggregation(
        aggregates=[agg],
        from_obj="events",
        groups=["entity_id"],
        intervals=["all"],
        dates=["2016-01-01", "2015-01-01"],
        state_table="cohort",
        state_group="entity_id",
        date_column='"date"',
        join_with_cohort_table=True,
    )

    st.execute(db_engine.connect())

    r = db_engine.execute("select * from events_entity_id order by entity_id, date")
    rows = [x for x in r]

    # these rows should be similar to the rows in the basic spacetime test,
    # except only the rows for entities 1 and 2 are present
    assert len(rows) == 4

    assert rows[0]["entity_id"] == 1
    assert rows[0]["date"] == date(2015, 1, 1)
    assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[1]["entity_id"] == 1
    assert rows[1]["date"] == date(2016, 1, 1)
    assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
    assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[2]["entity_id"] == 2
    assert rows[2]["date"] == date(2015, 1, 1)
    assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[3]["entity_id"] == 2
    assert rows[3]["date"] == date(2016, 1, 1)
    assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5
def test_generate_table_tasks():
    aggregations = [
        SpacetimeAggregation(
            prefix='prefix1',
            aggregates=[
                Categorical(
                    col='cat_one',
                    function='sum',
                    choices=['good', 'bad', 'inbetween'],
                    impute_rules={'coltype': 'categorical', 'all': {'type': 'zero'}})
            ],
            groups=['entity_id'],
            intervals=['all'],
            date_column='knowledge_date',
            output_date_column='as_of_date',
            dates=['2013-09-30', '2014-09-30'],
            state_table='states',
            state_group='entity_id',
            schema='features',
            from_obj='data'),
        SpacetimeAggregation(
            prefix='prefix2',
            aggregates=[
                Aggregate(
                    quantity='quantity_one',
                    function='count',
                    impute_rules={'coltype': 'aggregate', 'all': {'type': 'zero'}})
            ],
            groups=['entity_id'],
            intervals=['all'],
            date_column='knowledge_date',
            output_date_column='as_of_date',
            dates=['2013-09-30', '2014-09-30'],
            state_table='states',
            state_group='entity_id',
            schema='features',
            from_obj='data')
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'

        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).generate_all_table_tasks(aggregations, task_type='aggregation')
        for table_name, task in table_tasks.items():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)

        # build the aggregation tables to check the imputation tasks
        FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).process_table_tasks(table_tasks)

        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).generate_all_table_tasks(aggregations, task_type='imputation')
        for table_name, task in table_tasks.items():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)
def test_index_column_lookup():
    aggregations = [
        SpacetimeAggregation(
            prefix='prefix1',
            aggregates=[
                Categorical(
                    col='cat_one',
                    function='sum',
                    choices=['good', 'bad', 'inbetween'],
                    impute_rules={'coltype': 'categorical', 'all': {'type': 'zero'}})
            ],
            groups=['entity_id'],
            intervals=['all'],
            date_column='knowledge_date',
            output_date_column='as_of_date',
            dates=['2013-09-30', '2014-09-30'],
            state_table='states',
            state_group='entity_id',
            schema='features',
            from_obj='data'),
        SpacetimeAggregation(
            prefix='prefix2',
            aggregates=[
                Aggregate(
                    quantity='quantity_one',
                    function='count',
                    impute_rules={'coltype': 'aggregate', 'all': {'type': 'zero'}})
            ],
            groups=['entity_id', 'zip_code'],
            intervals=['all'],
            date_column='knowledge_date',
            output_date_column='as_of_date',
            dates=['2013-09-30', '2014-09-30'],
            state_table='states',
            state_group='entity_id',
            schema='features',
            from_obj='data')
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        feature_generator = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name)
        lookup = feature_generator.index_column_lookup(aggregations)
        assert lookup == {
            'prefix1_aggregation_imputed': ['as_of_date', 'entity_id'],
            'prefix2_aggregation_imputed': ['as_of_date', 'entity_id', 'zip_code']
        }
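# A minimal sketch of the setup_db helper the two tests above call; its
# contents here are an assumption (the actual helper lives alongside these
# tests). It only needs to provide the 'data' and 'states' tables that the
# aggregations' from_obj/state_table point at, with the columns the configs
# reference (cat_one, quantity_one, knowledge_date, zip_code).
def setup_db(engine):
    engine.execute('''create table data (
        entity_id int,
        zip_code text,
        cat_one varchar,
        quantity_one float,
        knowledge_date date
    )''')
    engine.execute('create table states (entity_id int, as_of_date date)')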
def test_basic_spacetime():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            'create table events (entity_id int, event_date date, outcome bool)'
        )
        for event in events_data:
            engine.execute('insert into events values (%s, %s, %s::bool)', event)

        engine.execute('create table states (entity_id int, as_of_date date)')
        for state in state_data:
            engine.execute('insert into states values (%s, %s)', state)

        agg = Aggregate('outcome::int', ['sum', 'avg'], {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"}
        })
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj='events',
            groups=['entity_id'],
            intervals=['1y', '2y', 'all'],
            dates=['2016-01-01', '2015-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='event_date',
            output_date_column='as_of_date'
        )

        st.execute(engine.connect())

        r = engine.execute('select * from events_entity_id order by entity_id, as_of_date')
        rows = [x for x in r]

        assert rows[0]['entity_id'] == 1
        assert rows[0]['as_of_date'] == date(2015, 1, 1)
        assert rows[0]['events_entity_id_1y_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_1y_outcome::int_avg'] == 0.5
        assert rows[0]['events_entity_id_2y_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_2y_outcome::int_avg'] == 0.5
        assert rows[0]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_all_outcome::int_avg'] == 0.5

        assert rows[1]['entity_id'] == 1
        assert rows[1]['as_of_date'] == date(2016, 1, 1)
        assert rows[1]['events_entity_id_1y_outcome::int_sum'] == 1
        assert rows[1]['events_entity_id_1y_outcome::int_avg'] == 0.5
        assert rows[1]['events_entity_id_2y_outcome::int_sum'] == 2
        assert rows[1]['events_entity_id_2y_outcome::int_avg'] == 0.5
        assert rows[1]['events_entity_id_all_outcome::int_sum'] == 2
        assert rows[1]['events_entity_id_all_outcome::int_avg'] == 0.5

        assert rows[2]['entity_id'] == 2
        assert rows[2]['as_of_date'] == date(2015, 1, 1)
        assert rows[2]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[2]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[2]['events_entity_id_2y_outcome::int_sum'] == 1
        assert rows[2]['events_entity_id_2y_outcome::int_avg'] == 0.5
        assert rows[2]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[2]['events_entity_id_all_outcome::int_avg'] == 0.5

        assert rows[3]['entity_id'] == 2
        assert rows[3]['as_of_date'] == date(2016, 1, 1)
        assert rows[3]['events_entity_id_1y_outcome::int_sum'] is None
        assert rows[3]['events_entity_id_1y_outcome::int_avg'] is None
        assert rows[3]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[3]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[3]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[3]['events_entity_id_all_outcome::int_avg'] == 0.5

        assert rows[4]['entity_id'] == 3
        assert rows[4]['as_of_date'] == date(2015, 1, 1)
        assert rows[4]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[4]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[4]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[4]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[4]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[4]['events_entity_id_all_outcome::int_avg'] == 0

        assert rows[5]['entity_id'] == 3
        assert rows[5]['as_of_date'] == date(2016, 1, 1)
        assert rows[5]['events_entity_id_1y_outcome::int_sum'] == 1
        assert rows[5]['events_entity_id_1y_outcome::int_avg'] == 0.5
        assert rows[5]['events_entity_id_2y_outcome::int_sum'] == 1
        assert rows[5]['events_entity_id_2y_outcome::int_avg'] == 0.25
        assert rows[5]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[5]['events_entity_id_all_outcome::int_avg'] == 0.25

        assert rows[6]['entity_id'] == 4
        # rows[6]['date'] == date(2015, 1, 1) is skipped due to no data!
        assert rows[6]['as_of_date'] == date(2016, 1, 1)
        assert rows[6]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[6]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[6]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[6]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[6]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[6]['events_entity_id_all_outcome::int_avg'] == 0

        assert len(rows) == 7

        # check some imputation results
        r = engine.execute('select * from events_aggregation_imputed order by entity_id, as_of_date')
        rows = [x for x in r]

        assert rows[6]['entity_id'] == 4
        assert rows[6]['as_of_date'] == date(2015, 1, 1)
        assert rows[6]['events_entity_id_1y_outcome::int_sum'] == 3
        assert rows[6]['events_entity_id_1y_outcome::int_sum_imp'] == 1
        assert round(float(rows[6]['events_entity_id_1y_outcome::int_avg']), 4) == 0.1667
        assert rows[6]['events_entity_id_1y_outcome::int_avg_imp'] == 1
        assert rows[6]['events_entity_id_2y_outcome::int_sum'] == 3
        assert rows[6]['events_entity_id_2y_outcome::int_sum_imp'] == 1
        assert round(float(rows[6]['events_entity_id_2y_outcome::int_avg']), 4) == 0.3333
        assert rows[6]['events_entity_id_2y_outcome::int_avg_imp'] == 1
        assert rows[6]['events_entity_id_all_outcome::int_sum'] == 3
        assert rows[6]['events_entity_id_all_outcome::int_sum_imp'] == 1
        assert round(float(rows[6]['events_entity_id_all_outcome::int_avg']), 4) == 0.3333
        assert rows[6]['events_entity_id_all_outcome::int_avg_imp'] == 1

        assert rows[7]['entity_id'] == 4
        assert rows[7]['as_of_date'] == date(2016, 1, 1)
        assert rows[7]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[7]['events_entity_id_1y_outcome::int_sum_imp'] == 0
        assert rows[7]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[7]['events_entity_id_1y_outcome::int_avg_imp'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_sum_imp'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_avg_imp'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_sum_imp'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_avg'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_avg_imp'] == 0

        assert len(rows) == 8
def test_input_min_date():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            'create table events (entity_id int, date date, outcome bool)'
        )
        for event in events_data:
            engine.execute('insert into events values (%s, %s, %s::bool)', event)

        engine.execute('create table states (entity_id int, date date)')
        for state in state_data:
            engine.execute('insert into states values (%s, %s)', state)

        agg = Aggregate('outcome::int', ['sum', 'avg'], {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"}
        })
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj='events',
            groups=['entity_id'],
            intervals=['all'],
            dates=['2016-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='"date"',
            input_min_date='2015-11-10'
        )

        st.execute(engine.connect())

        r = engine.execute('select * from events_entity_id order by entity_id')
        rows = [x for x in r]

        assert rows[0]['entity_id'] == 1
        assert rows[0]['date'] == date(2016, 1, 1)
        assert rows[0]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_all_outcome::int_avg'] == 1
        assert rows[1]['entity_id'] == 4
        assert rows[1]['date'] == date(2016, 1, 1)
        assert rows[1]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[1]['events_entity_id_all_outcome::int_avg'] == 0

        assert len(rows) == 2

        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj='events',
            groups=['entity_id'],
            intervals=['1y', 'all'],
            dates=['2016-01-01', '2015-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='"date"',
            input_min_date='2014-11-10'
        )
        with pytest.raises(ValueError):
            st.validate(engine.connect())
        with pytest.raises(ValueError):
            st.execute(engine.connect())