Example #1
def test_index_column_lookup(test_engine):
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id", "zip_code"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]

    features_schema_name = "features"
    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    )
    lookup = feature_generator.index_column_lookup(aggregations)
    assert lookup == {
        "prefix1_aggregation_imputed": ["as_of_date", "entity_id"],
        "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"],
    }
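
The test_engine argument is a pytest fixture whose definition is not shown in these examples. A minimal sketch consistent with the throwaway-Postgres pattern the later examples use inline (testing.postgresql plus SQLAlchemy) might look like the following; only the fixture name is actually required by the test above:

import pytest
import sqlalchemy
import testing.postgresql


@pytest.fixture
def test_engine():
    # spin up a disposable Postgres instance for the duration of one test
    # and hand the test a SQLAlchemy engine bound to it
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        yield engine
        engine.dispose()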
Example #2
    def _aggregation(self, aggregation_config, feature_dates, state_table):
        logging.info(
            "Building collate.SpacetimeAggregation for config %s and %s as_of_dates",
            aggregation_config,
            len(feature_dates),
        )

        # read top-level imputation rules from the aggregation config; we'll allow
        # these to be overridden by imputation rules at the individual feature
        # level as those get parsed as well
        agimp = aggregation_config.get("aggregates_imputation", {})
        catimp = aggregation_config.get("categoricals_imputation", {})
        arrcatimp = aggregation_config.get("array_categoricals_imputation", {})

        aggregates = [
            Aggregate(
                aggregate["quantity"],
                aggregate["metrics"],
                dict(agimp, coltype="aggregate", **aggregate.get("imputation", {})),
                coltype=aggregate.get("coltype"),
            )
            for aggregate in aggregation_config.get("aggregates", [])
        ]
        logging.info("Found %s quantity aggregates", len(aggregates))
        categoricals = self._build_categoricals(
            aggregation_config.get("categoricals", []), catimp
        )
        logging.info("Found %s categorical aggregates", len(categoricals))
        array_categoricals = self._build_array_categoricals(
            aggregation_config.get("array_categoricals", []), arrcatimp
        )
        logging.info("Found %s array categorical aggregates", len(array_categoricals))
        return SpacetimeAggregation(
            aggregates + categoricals + array_categoricals,
            from_obj=aggregation_config["from_obj"],
            intervals=aggregation_config["intervals"],
            groups=aggregation_config["groups"],
            dates=feature_dates,
            state_table=state_table,
            state_group=self.entity_id_column,
            date_column=aggregation_config["knowledge_date_column"],
            output_date_column="as_of_date",
            input_min_date=self.feature_start_time,
            schema=self.features_schema_name,
            prefix=aggregation_config["prefix"],
        )
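
For reference, here is a minimal aggregation_config consistent with the keys this method reads; the table and column names are hypothetical and simply mirror the ones used in the surrounding tests:

aggregation_config = {
    "prefix": "prefix1",
    "from_obj": "data",
    "groups": ["entity_id"],
    "intervals": ["all"],
    "knowledge_date_column": "knowledge_date",
    # top-level imputation rules, overridable per feature
    "aggregates_imputation": {"all": {"type": "zero"}},
    "aggregates": [
        {"quantity": "quantity_one", "metrics": ["count"]},
    ],
    "categoricals": [],
    "array_categoricals": [],
}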
Example #3
    def _aggregation(self, aggregation_config, feature_dates, state_table):
        logging.info(
            'Building collate.SpacetimeAggregation for config %s and as_of_dates %s',
            aggregation_config, feature_dates)

        # read top-level imputation rules from the aggregation config; we'll allow
        # these to be overridden by imputation rules at the individual feature
        # level as those get parsed as well
        agimp = aggregation_config.get('aggregates_imputation', {})
        catimp = aggregation_config.get('categoricals_imputation', {})
        arrcatimp = aggregation_config.get('array_categoricals_imputation', {})

        aggregates = [
            Aggregate(
                aggregate['quantity'], aggregate['metrics'],
                dict(agimp,
                     coltype='aggregate',
                     **aggregate.get('imputation', {})))
            for aggregate in aggregation_config.get('aggregates', [])
        ]
        logging.info('Found %s quantity aggregates', len(aggregates))
        categoricals = self._build_categoricals(
            aggregation_config.get('categoricals', []), catimp)
        logging.info('Found %s categorical aggregates', len(categoricals))
        array_categoricals = self._build_array_categoricals(
            aggregation_config.get('array_categoricals', []), arrcatimp)
        logging.info('Found %s array categorical aggregates',
                     len(array_categoricals))
        return SpacetimeAggregation(
            aggregates + categoricals + array_categoricals,
            from_obj=aggregation_config['from_obj'],
            intervals=aggregation_config['intervals'],
            groups=aggregation_config['groups'],
            dates=feature_dates,
            state_table=state_table,
            state_group=self.entity_id_column,
            date_column=aggregation_config['knowledge_date_column'],
            output_date_column='as_of_date',
            input_min_date=self.feature_start_time,
            schema=self.features_schema_name,
            prefix=aggregation_config['prefix'])
Example #4
def test_generate_table_tasks(test_engine):
    test_engine.execute("create schema features")
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            groups=["entity_id"],
            intervals=["all"],
            date_column="knowledge_date",
            output_date_column="as_of_date",
            dates=["2013-09-30", "2014-09-30"],
            state_table="states",
            state_group="entity_id",
            schema="features",
            from_obj="data",
        ),
    ]
    features_schema_name = "features"

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="aggregation")
    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)

    # build the aggregation tables to check the imputation tasks
    FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).process_table_tasks(table_tasks)

    table_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="imputation")

    for table_name, task in table_tasks.items():
        assert "DROP TABLE" in task["prepare"][0]
        assert "CREATE TABLE" in str(task["prepare"][1])
        assert "CREATE INDEX" in task["finalize"][0]
        assert isinstance(task["inserts"], list)
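
The assertions above pin down the shape of each table task: a dict with "prepare", "inserts", and "finalize" stages. A hypothetical entry follows, with placeholder SQL standing in for the statements collate actually generates; note that in the real tasks the CREATE TABLE entry is an object that stringifies to the statement, hence the str() call above:

task = {
    "prepare": [
        "DROP TABLE IF EXISTS features.prefix1_aggregation",
        "CREATE TABLE features.prefix1_aggregation (entity_id int, as_of_date date)",
    ],
    "inserts": [],  # INSERT ... SELECT statements that populate the table
    "finalize": ["CREATE INDEX ON features.prefix1_aggregation (entity_id)"],
}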
Example #5
def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())

        engine.execute('create table states (entity_id int, as_of_date date)')
        for state in states_table:
            engine.execute('insert into states values (%s, %s)', state)

        feat_sql = '\n'.join(
            [', prefix_entity_id_1y_%s_max int' % f for f in feat_list])
        engine.execute('''create table prefix_aggregation (
                entity_id int
                , as_of_date date
                %s
                )''' % feat_sql)
        ins_sql = ('insert into prefix_aggregation values (%s, %s' +
                   (', %s' * len(feat_list)) + ')')
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == 'error':
                continue

            for coltype in ['aggregate', 'categorical']:
                # only consider imputation rules available for this column type
                if not imputation_values[imp][coltype]['avail']:
                    continue

                impargs = imputation_values[imp][coltype]['kwargs']
                aggs = [
                    Aggregate(feat, ['max'], {
                        'coltype': coltype,
                        'all': dict(type=imp, **impargs)
                    }) for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj='prefix_events',
                    prefix='prefix',
                    groups=['entity_id'],
                    intervals=['1y'],
                    dates=['2016-01-01', '2016-02-03', '2016-03-14'],
                    state_table='states',
                    state_group='entity_id',
                    date_column='as_of_date',
                    input_min_date='2000-01-01',
                    output_date_column='as_of_date')

                conn = engine.connect()

                trans = conn.begin()

                # execute query to find columns with null values and create lists of columns
                # that do and do not need imputation when creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols, nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql('SELECT * FROM prefix_aggregation_imputed',
                                 engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert 'prefix_entity_id_1y_%s_max' % feat in df.columns.values
                    assert df['prefix_entity_id_1y_%s_max' %
                              feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" column and be non-null
                    # (categoricals are expected to be handled through the null category)
                    # zero_noflag imputation should not generate a flag either
                    if feat in exp_imp_cols and coltype != 'categorical' and imp != 'zero_noflag':
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat in df.columns.values
                        assert df['prefix_entity_id_1y_%s_max_imp' %
                                  feat].isnull().sum() == 0
                    else:
                        # should not generate an imputed column when not needed
                        assert 'prefix_entity_id_1y_%s_max_imp' % feat not in df.columns.values
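
This test relies on two module-level fixtures that are not shown: available_imputations (the registry of imputation rule types, including the "error" rule skipped above) and imputation_values, which records for each rule and column type whether the rule applies and which keyword arguments it takes. A minimal sketch of the assumed shape, with illustrative entries only:

imputation_values = {
    "mean": {
        "aggregate": {"avail": True, "kwargs": {}},
        "categorical": {"avail": False, "kwargs": {}},
    },
    "constant": {
        "aggregate": {"avail": True, "kwargs": {"value": 3}},
        "categorical": {"avail": False, "kwargs": {}},
    },
}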
Example #6
def test_imputation_output(feat_list, exp_imp_cols, feat_table):
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())

        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in states_table:
            engine.execute("insert into states values (%s, %s)", state)

        feat_sql = "\n".join(
            [", prefix_entity_id_1y_%s_max int" % f for f in feat_list])
        engine.execute("""create table prefix_aggregation (
                entity_id int
                , as_of_date date
                %s
                )""" % feat_sql)
        ins_sql = ("insert into prefix_aggregation values (%s, %s" +
                   (", %s" * len(feat_list)) + ")")
        for rec in feat_table:
            engine.execute(ins_sql, rec)

        for imp in available_imputations.keys():
            # skip error imputation
            if imp == "error":
                continue

            for coltype in ["aggregate", "categorical"]:
                # only consider imputation rules available for this column type
                if not imputation_values[imp][coltype]["avail"]:
                    continue

                impargs = imputation_values[imp][coltype]["kwargs"]
                aggs = [
                    Aggregate(
                        feat,
                        ["max"],
                        {
                            "coltype": coltype,
                            "all": dict(type=imp, **impargs)
                        },
                    ) for feat in feat_list
                ]
                st = SpacetimeAggregation(
                    aggregates=aggs,
                    from_obj="prefix_events",
                    prefix="prefix",
                    groups=["entity_id"],
                    intervals=["1y"],
                    dates=["2016-01-01", "2016-02-03", "2016-03-14"],
                    state_table="states",
                    state_group="entity_id",
                    date_column="as_of_date",
                    input_min_date="2000-01-01",
                    output_date_column="as_of_date",
                )

                conn = engine.connect()

                trans = conn.begin()

                # execute query to find columns with null values and create lists of columns
                # that do and do not need imputation when creating the imputation table
                res = conn.execute(st.find_nulls())
                null_counts = list(zip(res.keys(), res.fetchone()))
                impute_cols = [col for col, val in null_counts if val > 0]
                nonimpute_cols = [col for col, val in null_counts if val == 0]

                # sql to drop and create the imputation table
                drop_imp = st.get_drop(imputed=True)
                create_imp = st.get_impute_create(
                    impute_cols=impute_cols, nonimpute_cols=nonimpute_cols)

                # create the imputation table
                conn.execute(drop_imp)
                conn.execute(create_imp)

                trans.commit()

                # check the results
                df = pd.read_sql("SELECT * FROM prefix_aggregation_imputed",
                                 engine)

                # we should have a record for every entity/date combo
                assert df.shape[0] == len(states_table)

                for feat in feat_list:
                    # all of the input columns should be in the result and be null-free
                    assert "prefix_entity_id_1y_%s_max" % feat in df.columns.values
                    assert df["prefix_entity_id_1y_%s_max" %
                              feat].isnull().sum() == 0

                    # for non-categoricals, should add an "imputed" column and be non-null
                    # (categoricals are expected to be handled through the null category)
                    # zero_noflag imputation should not generate a flag either
                    if (feat in exp_imp_cols and coltype != "categorical"
                            and imp != "zero_noflag"):
                        assert ("prefix_entity_id_1y_%s_imp" % feat
                                in df.columns.values)
                        assert (df["prefix_entity_id_1y_%s_imp" %
                                   feat].isnull().sum() == 0)
                    else:
                        # should not generate an imputed column when not needed
                        assert ("prefix_entity_id_1y_%s_imp" % feat
                                not in df.columns.values)
Example #7
def test_basic_spacetime():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            "create table events (entity_id int, event_date date, outcome bool)"
        )
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, as_of_date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "2y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column="event_date",
            output_date_column="as_of_date",
        )

        st.execute(engine.connect())

        r = engine.execute(
            "select * from events_entity_id order by entity_id, as_of_date"
        )
        rows = [x for x in r]
        assert rows[0]["entity_id"] == 1
        assert rows[0]["as_of_date"] == date(2015, 1, 1)
        assert rows[0]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5
        assert rows[1]["entity_id"] == 1
        assert rows[1]["as_of_date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[1]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_2y_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[2]["entity_id"] == 2
        assert rows[2]["as_of_date"] == date(2015, 1, 1)
        assert rows[2]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[2]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[2]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_2y_outcome::int_avg"] == 0.5
        assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5
        assert rows[3]["entity_id"] == 2
        assert rows[3]["as_of_date"] == date(2016, 1, 1)
        assert rows[3]["events_entity_id_1y_outcome::int_sum"] is None
        assert rows[3]["events_entity_id_1y_outcome::int_avg"] is None
        assert rows[3]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[3]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5

        assert rows[4]["entity_id"] == 3
        assert rows[4]["as_of_date"] == date(2015, 1, 1)
        assert rows[4]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[4]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[5]["entity_id"] == 3
        assert rows[5]["as_of_date"] == date(2016, 1, 1)
        assert rows[5]["events_entity_id_1y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_1y_outcome::int_avg"] == 0.5
        assert rows[5]["events_entity_id_2y_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_2y_outcome::int_avg"] == 0.25
        assert rows[5]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[5]["events_entity_id_all_outcome::int_avg"] == 0.25

        assert rows[6]["entity_id"] == 4
        # the 2015-01-01 as_of_date row for entity 4 is skipped due to no data!
        assert rows[6]["as_of_date"] == date(2016, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[6]["events_entity_id_all_outcome::int_avg"] == 0
        assert len(rows) == 7

        # check some imputation results
        r = engine.execute(
            "select * from events_aggregation_imputed order by entity_id, as_of_date"
        )
        rows = [x for x in r]
        assert rows[6]["entity_id"] == 4
        assert rows[6]["as_of_date"] == date(2015, 1, 1)
        assert rows[6]["events_entity_id_1y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_1y_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_1y_outcome::int_avg"]), 4) == 0.1667
        )
        assert rows[6]["events_entity_id_1y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_2y_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_2y_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_2y_outcome::int_avg"]), 4) == 0.3333
        )
        assert rows[6]["events_entity_id_2y_outcome::int_avg_imp"] == 1
        assert rows[6]["events_entity_id_all_outcome::int_sum"] == 3
        assert rows[6]["events_entity_id_all_outcome::int_sum_imp"] == 1
        assert (
            round(float(rows[6]["events_entity_id_all_outcome::int_avg"]), 4) == 0.3333
        )
        assert rows[6]["events_entity_id_all_outcome::int_avg_imp"] == 1
        assert rows[7]["entity_id"] == 4
        assert rows[7]["as_of_date"] == date(2016, 1, 1)
        assert rows[7]["events_entity_id_1y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_1y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_2y_outcome::int_avg_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_sum_imp"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg"] == 0
        assert rows[7]["events_entity_id_all_outcome::int_avg_imp"] == 0
        assert len(rows) == 8
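
events_data and state_data are module-level fixtures that are not shown. Their shapes can be read off the create-table and insert statements above; the rows below are hypothetical stand-ins, not the actual fixture values behind the assertions:

events_data = [
    # (entity_id, event_date, outcome)
    (1, "2014-06-01", True),
    (1, "2014-09-15", False),
]
state_data = [
    # (entity_id, as_of_date)
    (1, "2015-01-01"),
    (1, "2016-01-01"),
]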
Example #8
def test_input_min_date():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute("create table events (entity_id int, date date, outcome bool)")
        for event in events_data:
            engine.execute("insert into events values (%s, %s, %s::bool)", event)

        engine.execute("create table states (entity_id int, date date)")
        for state in state_data:
            engine.execute("insert into states values (%s, %s)", state)

        agg = Aggregate(
            "outcome::int",
            ["sum", "avg"],
            {
                "coltype": "aggregate",
                "avg": {"type": "mean"},
                "sum": {"type": "constant", "value": 3},
                "max": {"type": "zero"},
            },
        )
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["all"],
            dates=["2016-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column='"date"',
            input_min_date="2015-11-10",
        )

        st.execute(engine.connect())

        r = engine.execute("select * from events_entity_id order by entity_id")
        rows = [x for x in r]

        assert rows[0]["entity_id"] == 1
        assert rows[0]["date"] == date(2016, 1, 1)
        assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
        assert rows[0]["events_entity_id_all_outcome::int_avg"] == 1
        assert rows[1]["entity_id"] == 4
        assert rows[1]["date"] == date(2016, 1, 1)
        assert rows[1]["events_entity_id_all_outcome::int_sum"] == 0
        assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0

        assert len(rows) == 2

        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj="events",
            groups=["entity_id"],
            intervals=["1y", "all"],
            dates=["2016-01-01", "2015-01-01"],
            state_table="states",
            state_group="entity_id",
            date_column='"date"',
            input_min_date="2014-11-10",
        )
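        # a 1y lookback from the 2015-01-01 as_of_date reaches back to
        # 2014-01-01, before input_min_date (2014-11-10), so both
        # validate() and execute() are expected to raise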
        with pytest.raises(ValueError):
            st.validate(engine.connect())
        with pytest.raises(ValueError):
            st.execute(engine.connect())
Example #9
def test_join_with_cohort_table(db_engine):
    # if we specify joining with the cohort table
    # only entity_id/date pairs in the cohort table should show up
    db_engine.execute("create table events (entity_id int, date date, outcome bool)")
    for event in events_data:
        db_engine.execute("insert into events values (%s, %s, %s::bool)", event)

    db_engine.execute("create table cohort (entity_id int, date date)")

    # use the states list from above except only include entities 1 and 2 in the cohort
    smaller_cohort = sorted(
        product(
            set([l[0] for l in events_data if l[0] == 1 or l[0] == 2]),
            set([l[1] for l in events_data] + [date(2016, 1, 1)]),
        )
    )
    for state in smaller_cohort:
        db_engine.execute("insert into cohort values (%s, %s)", state)

    # create our test aggregation with the important 'join_with_cohort_table' flag
    agg = Aggregate(
        "outcome::int",
        ["sum", "avg"],
        {
            "coltype": "aggregate",
            "avg": {"type": "mean"},
            "sum": {"type": "constant", "value": 3},
            "max": {"type": "zero"},
        },
    )
    st = SpacetimeAggregation(
        aggregates=[agg],
        from_obj="events",
        groups=["entity_id"],
        intervals=["all"],
        dates=["2016-01-01", "2015-01-01"],
        state_table="cohort",
        state_group="entity_id",
        date_column='"date"',
        join_with_cohort_table=True,
    )

    st.execute(db_engine.connect())

    r = db_engine.execute("select * from events_entity_id order by entity_id, date")
    rows = [x for x in r]

    # these rows should be similar to the rows in the basic spacetime test,
    # except only the rows for entities 1 and 2 are present
    assert len(rows) == 4

    assert rows[0]["entity_id"] == 1
    assert rows[0]["date"] == date(2015, 1, 1)
    assert rows[0]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[0]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[1]["entity_id"] == 1
    assert rows[1]["date"] == date(2016, 1, 1)
    assert rows[1]["events_entity_id_all_outcome::int_sum"] == 2
    assert rows[1]["events_entity_id_all_outcome::int_avg"] == 0.5

    assert rows[2]["entity_id"] == 2
    assert rows[2]["date"] == date(2015, 1, 1)
    assert rows[2]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[2]["events_entity_id_all_outcome::int_avg"] == 0.5
    assert rows[3]["entity_id"] == 2
    assert rows[3]["date"] == date(2016, 1, 1)
    assert rows[3]["events_entity_id_all_outcome::int_sum"] == 1
    assert rows[3]["events_entity_id_all_outcome::int_avg"] == 0.5
Example #10
def test_generate_table_tasks():
    aggregations = [
        SpacetimeAggregation(prefix='prefix1',
                             aggregates=[
                                 Categorical(
                                     col='cat_one',
                                     function='sum',
                                     choices=['good', 'bad', 'inbetween'],
                                     impute_rules={
                                         'coltype': 'categorical',
                                         'all': {
                                             'type': 'zero'
                                         }
                                     })
                             ],
                             groups=['entity_id'],
                             intervals=['all'],
                             date_column='knowledge_date',
                             output_date_column='as_of_date',
                             dates=['2013-09-30', '2014-09-30'],
                             state_table='states',
                             state_group='entity_id',
                             schema='features',
                             from_obj='data'),
        SpacetimeAggregation(prefix='prefix2',
                             aggregates=[
                                 Aggregate(quantity='quantity_one',
                                           function='count',
                                           impute_rules={
                                               'coltype': 'aggregate',
                                               'all': {
                                                   'type': 'zero'
                                               }
                                           })
                             ],
                             groups=['entity_id'],
                             intervals=['all'],
                             date_column='knowledge_date',
                             output_date_column='as_of_date',
                             dates=['2013-09-30', '2014-09-30'],
                             state_table='states',
                             state_group='entity_id',
                             schema='features',
                             from_obj='data')
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'

        table_tasks = FeatureGenerator(
            db_engine=engine, features_schema_name=features_schema_name
        ).generate_all_table_tasks(aggregations, task_type='aggregation')
        for table_name, task in table_tasks.items():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)

        # build the aggregation tables to check the imputation tasks
        FeatureGenerator(
            db_engine=engine,
            features_schema_name=features_schema_name).process_table_tasks(
                table_tasks)

        table_tasks = FeatureGenerator(
            db_engine=engine, features_schema_name=features_schema_name
        ).generate_all_table_tasks(aggregations, task_type='imputation')
        for table_name, task in table_tasks.items():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)
Example #11
def test_index_column_lookup():
    aggregations = [
        SpacetimeAggregation(prefix='prefix1',
                             aggregates=[
                                 Categorical(
                                     col='cat_one',
                                     function='sum',
                                     choices=['good', 'bad', 'inbetween'],
                                     impute_rules={
                                         'coltype': 'categorical',
                                         'all': {
                                             'type': 'zero'
                                         }
                                     })
                             ],
                             groups=['entity_id'],
                             intervals=['all'],
                             date_column='knowledge_date',
                             output_date_column='as_of_date',
                             dates=['2013-09-30', '2014-09-30'],
                             state_table='states',
                             state_group='entity_id',
                             schema='features',
                             from_obj='data'),
        SpacetimeAggregation(prefix='prefix2',
                             aggregates=[
                                 Aggregate(quantity='quantity_one',
                                           function='count',
                                           impute_rules={
                                               'coltype': 'aggregate',
                                               'all': {
                                                   'type': 'zero'
                                               }
                                           })
                             ],
                             groups=['entity_id', 'zip_code'],
                             intervals=['all'],
                             date_column='knowledge_date',
                             output_date_column='as_of_date',
                             dates=['2013-09-30', '2014-09-30'],
                             state_table='states',
                             state_group='entity_id',
                             schema='features',
                             from_obj='data')
    ]
    with testing.postgresql.Postgresql() as postgresql:
        engine = create_engine(postgresql.url())
        setup_db(engine)

        features_schema_name = 'features'
        feature_generator = FeatureGenerator(
            db_engine=engine, features_schema_name=features_schema_name)
        lookup = feature_generator.index_column_lookup(aggregations)
        assert lookup == {
            'prefix1_aggregation_imputed': ['as_of_date', 'entity_id'],
            'prefix2_aggregation_imputed': ['as_of_date', 'entity_id', 'zip_code'],
        }
Example #12
def test_basic_spacetime():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            'create table events (entity_id int, event_date date, outcome bool)'
        )
        for event in events_data:
            engine.execute(
                'insert into events values (%s, %s, %s::bool)',
                event
            )

        engine.execute(
            'create table states (entity_id int, as_of_date date)'
        )
        for state in state_data:
            engine.execute(
                'insert into states values (%s, %s)',
                state
            )

        agg = Aggregate('outcome::int', ['sum', 'avg'], {
            'coltype': 'aggregate',
            'avg': {'type': 'mean'},
            'sum': {'type': 'constant', 'value': 3},
            'max': {'type': 'zero'}
        })
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj='events',
            groups=['entity_id'],
            intervals=['1y', '2y', 'all'],
            dates=['2016-01-01', '2015-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='event_date',
            output_date_column='as_of_date'
        )

        st.execute(engine.connect())

        r = engine.execute('select * from events_entity_id order by entity_id, as_of_date')
        rows = [x for x in r]
        assert rows[0]['entity_id'] == 1
        assert rows[0]['as_of_date'] == date(2015, 1, 1)
        assert rows[0]['events_entity_id_1y_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_1y_outcome::int_avg'] == 0.5
        assert rows[0]['events_entity_id_2y_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_2y_outcome::int_avg'] == 0.5
        assert rows[0]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_all_outcome::int_avg'] == 0.5
        assert rows[1]['entity_id'] == 1
        assert rows[1]['as_of_date'] == date(2016, 1, 1)
        assert rows[1]['events_entity_id_1y_outcome::int_sum'] == 1
        assert rows[1]['events_entity_id_1y_outcome::int_avg'] == 0.5
        assert rows[1]['events_entity_id_2y_outcome::int_sum'] == 2
        assert rows[1]['events_entity_id_2y_outcome::int_avg'] == 0.5
        assert rows[1]['events_entity_id_all_outcome::int_sum'] == 2
        assert rows[1]['events_entity_id_all_outcome::int_avg'] == 0.5

        assert rows[2]['entity_id'] == 2
        assert rows[2]['as_of_date'] == date(2015, 1, 1)
        assert rows[2]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[2]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[2]['events_entity_id_2y_outcome::int_sum'] == 1
        assert rows[2]['events_entity_id_2y_outcome::int_avg'] == 0.5
        assert rows[2]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[2]['events_entity_id_all_outcome::int_avg'] == 0.5
        assert rows[3]['entity_id'] == 2
        assert rows[3]['as_of_date'] == date(2016, 1, 1)
        assert rows[3]['events_entity_id_1y_outcome::int_sum'] is None
        assert rows[3]['events_entity_id_1y_outcome::int_avg'] is None
        assert rows[3]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[3]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[3]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[3]['events_entity_id_all_outcome::int_avg'] == 0.5

        assert rows[4]['entity_id'] == 3
        assert rows[4]['as_of_date'] == date(2015, 1, 1)
        assert rows[4]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[4]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[4]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[4]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[4]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[4]['events_entity_id_all_outcome::int_avg'] == 0
        assert rows[5]['entity_id'] == 3
        assert rows[5]['as_of_date'] == date(2016, 1, 1)
        assert rows[5]['events_entity_id_1y_outcome::int_sum'] == 1
        assert rows[5]['events_entity_id_1y_outcome::int_avg'] == 0.5
        assert rows[5]['events_entity_id_2y_outcome::int_sum'] == 1
        assert rows[5]['events_entity_id_2y_outcome::int_avg'] == 0.25
        assert rows[5]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[5]['events_entity_id_all_outcome::int_avg'] == 0.25

        assert rows[6]['entity_id'] == 4
        # the 2015-01-01 as_of_date row for entity 4 is skipped due to no data!
        assert rows[6]['as_of_date'] == date(2016, 1, 1)
        assert rows[6]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[6]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[6]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[6]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[6]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[6]['events_entity_id_all_outcome::int_avg'] == 0
        assert len(rows) == 7

        # check some imputation results
        r = engine.execute('select * from events_aggregation_imputed order by entity_id, as_of_date')
        rows = [x for x in r]
        assert rows[6]['entity_id'] == 4
        assert rows[6]['as_of_date'] == date(2015, 1, 1)
        assert rows[6]['events_entity_id_1y_outcome::int_sum'] == 3
        assert rows[6]['events_entity_id_1y_outcome::int_sum_imp'] == 1
        assert round(float(rows[6]['events_entity_id_1y_outcome::int_avg']), 4) == 0.1667
        assert rows[6]['events_entity_id_1y_outcome::int_avg_imp'] == 1
        assert rows[6]['events_entity_id_2y_outcome::int_sum'] == 3
        assert rows[6]['events_entity_id_2y_outcome::int_sum_imp'] == 1
        assert round(float(rows[6]['events_entity_id_2y_outcome::int_avg']), 4) == 0.3333
        assert rows[6]['events_entity_id_2y_outcome::int_avg_imp'] == 1
        assert rows[6]['events_entity_id_all_outcome::int_sum'] == 3
        assert rows[6]['events_entity_id_all_outcome::int_sum_imp'] == 1
        assert round(float(rows[6]['events_entity_id_all_outcome::int_avg']), 4) == 0.3333
        assert rows[6]['events_entity_id_all_outcome::int_avg_imp'] == 1
        assert rows[7]['entity_id'] == 4
        assert rows[7]['as_of_date'] == date(2016, 1, 1)
        assert rows[7]['events_entity_id_1y_outcome::int_sum'] == 0
        assert rows[7]['events_entity_id_1y_outcome::int_sum_imp'] == 0
        assert rows[7]['events_entity_id_1y_outcome::int_avg'] == 0
        assert rows[7]['events_entity_id_1y_outcome::int_avg_imp'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_sum'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_sum_imp'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_avg'] == 0
        assert rows[7]['events_entity_id_2y_outcome::int_avg_imp'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_sum_imp'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_avg'] == 0
        assert rows[7]['events_entity_id_all_outcome::int_avg_imp'] == 0
        assert len(rows) == 8
Example #13
def test_input_min_date():
    with testing.postgresql.Postgresql() as psql:
        engine = sqlalchemy.create_engine(psql.url())
        engine.execute(
            'create table events (entity_id int, date date, outcome bool)'
        )
        for event in events_data:
            engine.execute(
                'insert into events values (%s, %s, %s::bool)',
                event
            )

        engine.execute(
            'create table states (entity_id int, date date)'
        )
        for state in state_data:
            engine.execute(
                'insert into states values (%s, %s)',
                state
            )

        agg = Aggregate('outcome::int', ['sum', 'avg'], {
            'coltype': 'aggregate',
            'avg': {'type': 'mean'},
            'sum': {'type': 'constant', 'value': 3},
            'max': {'type': 'zero'}
        })
        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj='events',
            groups=['entity_id'],
            intervals=['all'],
            dates=['2016-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='"date"',
            input_min_date='2015-11-10'
        )

        st.execute(engine.connect())

        r = engine.execute('select * from events_entity_id order by entity_id')
        rows = [x for x in r]

        assert rows[0]['entity_id'] == 1
        assert rows[0]['date'] == date(2016, 1, 1)
        assert rows[0]['events_entity_id_all_outcome::int_sum'] == 1
        assert rows[0]['events_entity_id_all_outcome::int_avg'] == 1
        assert rows[1]['entity_id'] == 4
        assert rows[1]['date'] == date(2016, 1, 1)
        assert rows[1]['events_entity_id_all_outcome::int_sum'] == 0
        assert rows[1]['events_entity_id_all_outcome::int_avg'] == 0

        assert len(rows) == 2

        st = SpacetimeAggregation(
            aggregates=[agg],
            from_obj='events',
            groups=['entity_id'],
            intervals=['1y', 'all'],
            dates=['2016-01-01', '2015-01-01'],
            state_table='states',
            state_group='entity_id',
            date_column='"date"',
            input_min_date='2014-11-10'
        )
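        # a 1y lookback from the 2015-01-01 as_of_date reaches back to
        # 2014-01-01, before input_min_date (2014-11-10), so both
        # validate() and execute() are expected to raise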
        with pytest.raises(ValueError):
            st.validate(engine.connect())
        with pytest.raises(ValueError):
            st.execute(engine.connect())