from datetime import date

import pandas
import testing.postgresql
from sqlalchemy import create_engine

# Module path assumed from this file's context; adjust to wherever
# FeatureGenerator lives in this project.
from triage.feature_generators import FeatureGenerator


def test_generate_table_tasks():
    aggregate_config = [{
        'prefix': 'prefix1',
        'categoricals': [
            {
                'column': 'cat_one',
                'choice_query': 'select distinct(cat_one) from data',
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }, {
        'prefix': 'prefix2',
        'aggregates': [
            {
                'quantity': 'quantity_one',
                'metrics': ['count']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)
        features_schema_name = 'features'
        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        # Each task should bundle its statements into
        # prepare/inserts/finalize phases
        for task in table_tasks.values():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)
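
# NOTE: setup_db is assumed to be defined elsewhere in this module. The
# sketch below is reconstructed from the inline fixture in
# test_array_categoricals (with scalar rather than array categoricals) and
# is consistent with the expected outputs in test_feature_generation; treat
# it as an illustration of the fixture these tests rely on, not the
# canonical helper.
def setup_db(engine):
    engine.execute("""
        create table data (
            entity_id int,
            knowledge_date date,
            cat_one varchar,
            quantity_one float
        )
    """)
    input_data = [
        # entity_id, knowledge_date, cat_one, quantity_one
        (1, date(2014, 1, 1), 'good', 10000),
        (1, date(2014, 10, 11), 'good', None),
        (3, date(2012, 6, 8), 'bad', 342),
        (3, date(2014, 12, 21), 'inbetween', 600),
        (4, date(2014, 4, 4), 'bad', 1236),
    ]
    for row in input_data:
        engine.execute('insert into data values (%s, %s, %s, %s)', row)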
def test_replace():
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [
            {
                'quantity': 'quantity_one',
                'metrics': ['sum', 'count']
            },
        ],
        'categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)
        features_schema_name = 'features'

        # With replace=False, the first run should still create the table
        feature_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        assert len(feature_tables) == 1

        # A second run with replace=False should find the table already
        # populated and generate no tasks for it
        table_tasks = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
            replace=False
        ).generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        assert len(table_tasks['aprefix_entity_id'].keys()) == 0
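
# For illustration only: given the task structure asserted in
# test_generate_table_tasks above, a consumer of generate_all_table_tasks
# would presumably execute each task's phases in prepare -> inserts ->
# finalize order. This hypothetical runner is an assumption about how such
# tasks are consumed, not FeatureGenerator's actual execution code.
def run_table_task(engine, task):
    for statement in task['prepare']:    # e.g. DROP TABLE, CREATE TABLE
        engine.execute(statement)
    for insert in task['inserts']:       # the INSERTs that populate the table
        engine.execute(insert)
    for statement in task['finalize']:   # e.g. CREATE INDEX
        engine.execute(statement)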
# Excerpt of a pipeline class's run() method, sketching how LabelGenerator
# and FeatureGenerator are driven from temporal splits. Its keyword
# arguments differ from the FeatureGenerator API exercised by the tests in
# this file.
def run(self):
    # 1. generate temporal splits
    for split in self.temporal_splits():
        # 2. create labels
        labels_table = LabelGenerator(
            events_table=self.config['events_table'],
            start_date=split['train_start'],
            end_date=split['train_end'],
            split=split,
            db_engine=self.db_engine
        ).generate()

        # 3. generate features
        features_table = FeatureGenerator(
            feature_aggregations=self.config['feature_aggregations'],
            feature_dates=split['feature_dates'],
            data_table=self.config['data_table'],
            db_engine=self.db_engine
        ).generate()
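
# Illustrative only: the keys below are inferred from the dict accesses in
# run() above ('train_start', 'train_end', 'feature_dates'); the values,
# and any additional keys the real temporal_splits() emits, are assumptions.
example_split = {
    'train_start': '2013-01-01',
    'train_end': '2013-12-31',
    'feature_dates': ['2013-09-30', '2013-12-31'],
}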
def test_feature_generation():
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [
            {
                'quantity': 'quantity_one',
                'metrics': ['sum', 'count']
            },
        ],
        'categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]
    expected_output = {
        'aprefix_entity_id': [
            {
                'entity_id': 3,
                'as_of_date': date(2013, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 342,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 1,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 10000,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 1,
                'aprefix_entity_id_all_cat_one_bad_sum': 0
            },
            {
                'entity_id': 3,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 342,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 4,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_quantity_one_sum': 1236,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
        ]
    }
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)
        features_schema_name = 'features'
        output_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        for output_table in output_tables:
            records = pandas.read_sql(
                'select * from {}.{} order by as_of_date, entity_id'.format(
                    features_schema_name, output_table),
                engine
            ).to_dict('records')
            assert records == expected_output[output_table]
def test_array_categoricals():
    aggregate_config = [{
        'prefix': 'aprefix',
        'array_categoricals': [
            {
                'column': 'cat_one',
                'choices': ['good', 'bad', 'inbetween'],
                'metrics': ['sum']
            },
        ],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data'
    }]
    expected_output = {
        'aprefix_entity_id': [
            {
                'entity_id': 3,
                'as_of_date': date(2013, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 1,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 1,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 0
            },
            {
                'entity_id': 3,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
            {
                'entity_id': 4,
                'as_of_date': date(2014, 9, 30),
                'aprefix_entity_id_all_cat_one_good_sum': 0,
                'aprefix_entity_id_all_cat_one_inbetween_sum': 0,
                'aprefix_entity_id_all_cat_one_bad_sum': 1
            },
        ]
    }
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        input_data = [
            # entity_id, knowledge_date, cat_one, quantity_one
            (1, date(2014, 1, 1), ['good', 'good'], 10000),
            (1, date(2014, 10, 11), ['good'], None),
            (3, date(2012, 6, 8), ['bad'], 342),
            (3, date(2014, 12, 21), ['inbetween'], 600),
            (4, date(2014, 4, 4), ['bad'], 1236)
        ]
        engine.execute("""
            create table data (
                entity_id int,
                knowledge_date date,
                cat_one varchar[],
                quantity_one float
            )
        """)
        for row in input_data:
            engine.execute('insert into data values (%s, %s, %s, %s)', row)
        features_schema_name = 'features'
        output_tables = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name
        ).create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        for output_table in output_tables:
            records = pandas.read_sql(
                'select * from {}.{} order by as_of_date, entity_id'.format(
                    features_schema_name, output_table),
                engine
            ).to_dict('records')
            assert records == expected_output[output_table]