Пример #1
0
def test_generate_table_tasks():
    """Check the structure of the tasks returned by generate_all_table_tasks.

    Two aggregation configs are supplied: one categorical aggregation whose
    choices come from a ``choice_query`` and one plain quantity aggregation.
    Every resulting task must expose a DROP TABLE and a CREATE TABLE prepare
    step, a CREATE INDEX finalize step, and a list of inserts.
    """
    categorical_aggregation = {
        'prefix': 'prefix1',
        'categoricals': [{
            'column': 'cat_one',
            'choice_query': 'select distinct(cat_one) from data',
            'metrics': ['sum'],
        }],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data',
    }
    quantity_aggregation = {
        'prefix': 'prefix2',
        'aggregates': [{
            'quantity': 'quantity_one',
            'metrics': ['count'],
        }],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data',
    }
    aggregate_config = [categorical_aggregation, quantity_aggregation]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        # NOTE(review): setup_db is defined elsewhere; presumably it creates
        # and fills the ``data`` table the configs read from -- confirm.
        setup_db(engine)

        generator = FeatureGenerator(
            db_engine=engine,
            features_schema_name='features',
        )
        table_tasks = generator.generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        for task in table_tasks.values():
            prepare_steps = task['prepare']
            assert 'DROP TABLE' in prepare_steps[0]
            # The second prepare step is stringified before the check, so it
            # need not be a plain string.
            assert 'CREATE TABLE' in str(prepare_steps[1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)
Пример #2
0
def test_replace():
    """Exercise FeatureGenerator twice with ``replace=False`` on one database.

    The first pass calls ``create_all_tables`` and must yield exactly one
    feature table. The second pass calls ``generate_all_table_tasks`` with the
    same configuration and must yield an empty task mapping for
    ``aprefix_entity_id``.
    """
    quantity_aggregate = {
        'quantity': 'quantity_one',
        'metrics': ['sum', 'count'],
    }
    categorical_aggregate = {
        'column': 'cat_one',
        'choices': ['good', 'bad'],
        'metrics': ['sum'],
    }
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [quantity_aggregate],
        'categoricals': [categorical_aggregate],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data',
    }]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        # Both generators are built with identical settings.
        generator_settings = {
            'db_engine': engine,
            'features_schema_name': 'features',
            'replace': False,
        }

        first_pass = FeatureGenerator(**generator_settings)
        feature_tables = first_pass.create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        assert len(feature_tables) == 1

        second_pass = FeatureGenerator(**generator_settings)
        table_tasks = second_pass.generate_all_table_tasks(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )
        # NOTE(review): presumably empty because the table already exists and
        # replace is off -- confirm against FeatureGenerator.
        remaining_task_keys = table_tasks['aprefix_entity_id'].keys()
        assert len(remaining_task_keys) == 0
Пример #3
0
    def run(self):
        """Generate labels and features for every temporal split.

        For each split returned by ``self.temporal_splits()``, a
        ``LabelGenerator`` and then a ``FeatureGenerator`` are built from
        ``self.config``, the split, and ``self.db_engine``, and ``generate()``
        is called on each.
        """
        # Step 1: walk the temporal splits.
        for temporal_split in self.temporal_splits():

            # Step 2: build the labels for this split.
            label_generator = LabelGenerator(
                events_table=self.config['events_table'],
                start_date=temporal_split['train_start'],
                end_date=temporal_split['train_end'],
                split=temporal_split,
                db_engine=self.db_engine
            )
            # NOTE(review): the result is not used in the visible code.
            labels_table = label_generator.generate()

            # Step 3: build the features for this split.
            feature_generator = FeatureGenerator(
                feature_aggregations=self.config['feature_aggregations'],
                feature_dates=temporal_split['feature_dates'],
                data_table=self.config['data_table'],
                db_engine=self.db_engine
            )
            # NOTE(review): the result is not used in the visible code.
            features_table = feature_generator.generate()
Пример #4
0
def test_feature_generation():
    """Compare the rows of every generated feature table with fixed records.

    One aggregation config (sum/count of ``quantity_one`` plus a categorical
    sum over the ``good``/``bad`` choices of ``cat_one``) is run for two
    feature dates. Each created table is read back ordered by
    ``as_of_date, entity_id`` and must equal the expected records.
    """
    aggregate_config = [{
        'prefix': 'aprefix',
        'aggregates': [{
            'quantity': 'quantity_one',
            'metrics': ['sum', 'count'],
        }],
        'categoricals': [{
            'column': 'cat_one',
            'choices': ['good', 'bad'],
            'metrics': ['sum'],
        }],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data',
    }]

    # One tuple per expected record; the quantity count is 1 in every record.
    expected_rows = [
        # entity_id, as_of_date, quantity sum, 'good' sum, 'bad' sum
        (3, date(2013, 9, 30), 342, 0, 1),
        (1, date(2014, 9, 30), 10000, 1, 0),
        (3, date(2014, 9, 30), 342, 0, 1),
        (4, date(2014, 9, 30), 1236, 0, 1),
    ]
    expected_output = {
        'aprefix_entity_id': [
            {
                'entity_id': entity_id,
                'as_of_date': as_of_date,
                'aprefix_entity_id_all_quantity_one_sum': quantity_sum,
                'aprefix_entity_id_all_quantity_one_count': 1,
                'aprefix_entity_id_all_cat_one_good_sum': good_sum,
                'aprefix_entity_id_all_cat_one_bad_sum': bad_sum,
            }
            for entity_id, as_of_date, quantity_sum, good_sum, bad_sum
            in expected_rows
        ]
    }

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        generator = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
        )
        output_tables = generator.create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        # The explicit ordering makes the row order comparable to the list.
        query_template = 'select * from {}.{} order by as_of_date, entity_id'
        for output_table in output_tables:
            query = query_template.format(features_schema_name, output_table)
            frame = pandas.read_sql(query, engine)
            records = frame.to_dict('records')
            assert records == expected_output[output_table]
Пример #5
0
def test_array_categoricals():
    """Compare array-categorical feature output with fixed records.

    A ``data`` table whose ``cat_one`` column is a ``varchar[]`` is created
    and filled by hand, then an ``array_categoricals`` aggregation over the
    choices ``good``/``bad``/``inbetween`` is run for two feature dates. Each
    created table is read back ordered by ``as_of_date, entity_id`` and must
    equal the expected records.
    """
    aggregate_config = [{
        'prefix': 'aprefix',
        'array_categoricals': [{
            'column': 'cat_one',
            'choices': ['good', 'bad', 'inbetween'],
            'metrics': ['sum'],
        }],
        'groups': ['entity_id'],
        'intervals': ['all'],
        'knowledge_date_column': 'knowledge_date',
        'from_obj': 'data',
    }]

    # One tuple per expected record.
    expected_rows = [
        # entity_id, as_of_date, 'good' sum, 'inbetween' sum, 'bad' sum
        (3, date(2013, 9, 30), 0, 0, 1),
        (1, date(2014, 9, 30), 1, 0, 0),
        (3, date(2014, 9, 30), 0, 0, 1),
        (4, date(2014, 9, 30), 0, 0, 1),
    ]
    expected_output = {
        'aprefix_entity_id': [
            {
                'entity_id': entity_id,
                'as_of_date': as_of_date,
                'aprefix_entity_id_all_cat_one_good_sum': good_sum,
                'aprefix_entity_id_all_cat_one_inbetween_sum': inbetween_sum,
                'aprefix_entity_id_all_cat_one_bad_sum': bad_sum,
            }
            for entity_id, as_of_date, good_sum, inbetween_sum, bad_sum
            in expected_rows
        ]
    }

    input_data = [
        # entity_id, knowledge_date, cat_one, quantity_one
        (1, date(2014, 1, 1), ['good', 'good'], 10000),
        (1, date(2014, 10, 11), ['good'], None),
        (3, date(2012, 6, 8), ['bad'], 342),
        (3, date(2014, 12, 21), ['inbetween'], 600),
        (4, date(2014, 4, 4), ['bad'], 1236)
    ]

    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())

        # The table is built here because cat_one must be an array column.
        engine.execute("""
            create table data (
                entity_id int,
                knowledge_date date,
                cat_one varchar[],
                quantity_one float
            )
        """)
        insert_statement = 'insert into data values (%s, %s, %s, %s)'
        for record in input_data:
            engine.execute(insert_statement, record)

        features_schema_name = 'features'
        generator = FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name,
        )
        output_tables = generator.create_all_tables(
            feature_dates=['2013-09-30', '2014-09-30'],
            feature_aggregation_config=aggregate_config,
        )

        # The explicit ordering makes the row order comparable to the list.
        query_template = 'select * from {}.{} order by as_of_date, entity_id'
        for output_table in output_tables:
            query = query_template.format(features_schema_name, output_table)
            frame = pandas.read_sql(query, engine)
            records = frame.to_dict('records')
            assert records == expected_output[output_table]