def test_categorical_cast():
    """A Categorical built with a coltype should cast every generated column."""
    categorical = Categorical("c", ['A', 'B', 'C'], "sum", {}, coltype="SMALLINT")
    expected = [
        "sum((c = 'A')::INT)::SMALLINT",
        "sum((c = 'B')::INT)::SMALLINT",
        "sum((c = 'C')::INT)::SMALLINT",
    ]
    # Columns stringify to their SQL expressions; each one carries the cast.
    assert [str(column) for column in categorical.get_columns()] == expected
def test_categorical_nones():
    """None choices should behave like Compare's include_null handling.

    An explicit '_NULL': None entry in a choices dict must yield the same
    quantities as Compare(..., include_null=True), and a bare None inside a
    choices list must yield the same quantity values.
    """
    d1 = Categorical('col', { 'vala': 'a', 'valb': 'b', 'valc': 'c', '_NULL': None }, [], {}).quantities
    d2 = Compare('col', '=', { 'vala': 'a', 'valb': 'b', 'valc': 'c' }, [], {}, op_in_name=False, include_null=True).quantities
    assert d1 == d2
    d3 = Categorical('col', ['a', 'b', 'c', None], [], {}).quantities
    # BUG FIX: previously re-compared d1 to d2 (already asserted above),
    # leaving d3 unused; the list-with-None form is what this assertion
    # is meant to cover.
    assert sorted(d1.values()) == sorted(d3.values())
def test_categorical_same_as_compare():
    """Categorical quantities should match an equality Compare's quantities."""
    from_categorical = Categorical("col", { "vala": "a", "valb": "b", "valc": "c" }, [], {}).quantities
    from_compare = Compare("col", "=", { "vala": "a", "valb": "b", "valc": "c" }, [], {}).quantities
    # Key naming differs by default, so compare the values only.
    assert sorted(from_categorical.values()) == sorted(from_compare.values())
    # With op_in_name=True the keys are named exactly like Compare's.
    with_op_names = Categorical("col", { "vala": "a", "valb": "b", "valc": "c" }, [], {}, op_in_name=True).quantities
    assert from_compare == with_op_names
def test_categorical_same_as_compare():
    """Categorical quantities should line up with an equality Compare."""
    choices = { 'vala': 'a', 'valb': 'b', 'valc': 'c' }
    cat_quantities = Categorical('col', choices, [], {}).quantities
    cmp_quantities = Compare('col', '=', choices, [], {}).quantities
    # Default key naming differs, so only the values are comparable here.
    assert sorted(cat_quantities.values()) == sorted(cmp_quantities.values())
    # op_in_name=True makes the key naming identical to Compare's.
    cat_with_op = Categorical('col', choices, [], {}, op_in_name=True).quantities
    assert cmp_quantities == cat_with_op
def test_index_column_lookup(test_engine):
    """index_column_lookup should map each imputed table to its index columns.

    Index columns are the output date column plus the aggregation's groups.
    """
    # Settings shared by both aggregations; only prefix/aggregates/groups vary.
    shared_settings = dict(
        intervals=["all"],
        date_column="knowledge_date",
        output_date_column="as_of_date",
        dates=["2013-09-30", "2014-09-30"],
        state_table="states",
        state_group="entity_id",
        schema="features",
        from_obj="data",
    )
    categorical_aggregation = SpacetimeAggregation(
        prefix="prefix1",
        aggregates=[
            Categorical(
                col="cat_one",
                function="sum",
                choices=["good", "bad", "inbetween"],
                impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
            )
        ],
        groups=["entity_id"],
        **shared_settings,
    )
    count_aggregation = SpacetimeAggregation(
        prefix="prefix2",
        aggregates=[
            Aggregate(
                quantity="quantity_one",
                function="count",
                impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
            )
        ],
        groups=["entity_id", "zip_code"],
        **shared_settings,
    )
    feature_generator = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name="features",
    )
    lookup = feature_generator.index_column_lookup(
        [categorical_aggregation, count_aggregation]
    )
    assert lookup == {
        "prefix1_aggregation_imputed": ["as_of_date", "entity_id"],
        "prefix2_aggregation_imputed": ["as_of_date", "entity_id", "zip_code"],
    }
def _build_categoricals(self, categorical_config, impute_rules):
    """Build a Categorical aggregate for each entry in categorical_config.

    Each entry supplies the column, its metrics, and optional per-entry
    imputation overrides layered on top of the shared impute_rules.
    """
    # TODO: only include null flag where necessary
    categoricals = []
    for categorical in categorical_config:
        # dict() with keyword args layers coltype and any per-entry
        # imputation overrides onto the shared rules (and raises on a
        # duplicate 'coltype' key, which is intentional).
        categoricals.append(
            Categorical(
                col=categorical['column'],
                choices=self._build_choices(categorical),
                function=categorical['metrics'],
                impute_rules=dict(impute_rules, coltype='categorical', **categorical.get('imputation', {})),
                include_null=True,
            )
        )
    return categoricals
def _build_categoricals(self, categorical_config, impute_rules):
    """Create one Categorical per config entry.

    Imputation rules are the shared impute_rules plus a 'categorical'
    coltype and any per-entry overrides from the entry's 'imputation' key.
    """
    # TODO: only include null flag where necessary
    return [
        Categorical(
            col=entry["column"],
            choices=self._build_choices(entry),
            function=entry["metrics"],
            impute_rules=dict(impute_rules, coltype="categorical", **entry.get("imputation", {})),
            include_null=True,
        )
        for entry in categorical_config
    ]
def test_categorical_nones():
    """None choices should behave like Compare's include_null handling."""
    # Explicit '_NULL': None in a choices dict ...
    null_keyed = Categorical("col", { "vala": "a", "valb": "b", "valc": "c", "_NULL": None }, [], {}).quantities
    # ... must match Compare with include_null=True exactly.
    compare_with_null = Compare(
        "col",
        "=",
        { "vala": "a", "valb": "b", "valc": "c" },
        [],
        {},
        op_in_name=False,
        include_null=True,
    ).quantities
    assert null_keyed == compare_with_null
    # A bare None in a choices list yields the same quantity values.
    null_in_list = Categorical("col", ["a", "b", "c", None], [], {}).quantities
    assert sorted(null_keyed.values()) == sorted(null_in_list.values())
def test_generate_table_tasks(test_engine):
    """Aggregation and imputation task dicts should have the expected shape.

    Each task must carry DROP/CREATE statements in 'prepare', a CREATE INDEX
    in 'finalize', and a list of insert statements.
    """
    test_engine.execute('create schema features')

    def assert_task_shape(tasks):
        # Every generated task, regardless of table, has the same structure.
        for table_name, task in tasks.items():
            assert "DROP TABLE" in task["prepare"][0]
            assert "CREATE TABLE" in str(task["prepare"][1])
            assert "CREATE INDEX" in task["finalize"][0]
            assert isinstance(task["inserts"], list)

    shared_settings = dict(
        groups=["entity_id"],
        intervals=["all"],
        date_column="knowledge_date",
        output_date_column="as_of_date",
        dates=["2013-09-30", "2014-09-30"],
        state_table="states",
        state_group="entity_id",
        schema="features",
        from_obj="data",
    )
    aggregations = [
        SpacetimeAggregation(
            prefix="prefix1",
            aggregates=[
                Categorical(
                    col="cat_one",
                    function="sum",
                    choices=["good", "bad", "inbetween"],
                    impute_rules={"coltype": "categorical", "all": {"type": "zero"}},
                )
            ],
            **shared_settings,
        ),
        SpacetimeAggregation(
            prefix="prefix2",
            aggregates=[
                Aggregate(
                    quantity="quantity_one",
                    function="count",
                    impute_rules={"coltype": "aggregate", "all": {"type": "zero"}},
                )
            ],
            **shared_settings,
        ),
    ]
    features_schema_name = "features"

    aggregation_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="aggregation")
    assert_task_shape(aggregation_tasks)

    # build the aggregation tables to check the imputation tasks
    FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).process_table_tasks(aggregation_tasks)

    imputation_tasks = FeatureGenerator(
        db_engine=test_engine,
        features_schema_name=features_schema_name,
    ).generate_all_table_tasks(aggregations, task_type="imputation")
    assert_task_shape(imputation_tasks)
def test_generate_table_tasks():
    """Aggregation and imputation task dicts should have the expected shape.

    Spins up a throwaway postgres, builds the aggregation tables, then
    checks both the aggregation and imputation task structures.
    """

    def assert_task_shape(tasks):
        # Every generated task has DROP/CREATE prep, an index, and inserts.
        for table_name, task in tasks.items():
            assert 'DROP TABLE' in task['prepare'][0]
            assert 'CREATE TABLE' in str(task['prepare'][1])
            assert 'CREATE INDEX' in task['finalize'][0]
            assert isinstance(task['inserts'], list)

    shared_settings = dict(
        groups=['entity_id'],
        intervals=['all'],
        date_column='knowledge_date',
        output_date_column='as_of_date',
        dates=['2013-09-30', '2014-09-30'],
        state_table='states',
        state_group='entity_id',
        schema='features',
        from_obj='data',
    )
    aggregations = [
        SpacetimeAggregation(
            prefix='prefix1',
            aggregates=[
                Categorical(
                    col='cat_one',
                    function='sum',
                    choices=['good', 'bad', 'inbetween'],
                    impute_rules={'coltype': 'categorical', 'all': {'type': 'zero'}},
                )
            ],
            **shared_settings,
        ),
        SpacetimeAggregation(
            prefix='prefix2',
            aggregates=[
                Aggregate(
                    quantity='quantity_one',
                    function='count',
                    impute_rules={'coltype': 'aggregate', 'all': {'type': 'zero'}},
                )
            ],
            **shared_settings,
        ),
    ]

    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        setup_db(db_engine)
        features_schema_name = 'features'

        aggregation_tasks = FeatureGenerator(
            db_engine=db_engine,
            features_schema_name=features_schema_name,
        ).generate_all_table_tasks(aggregations, task_type='aggregation')
        assert_task_shape(aggregation_tasks)

        # build the aggregation tables to check the imputation tasks
        FeatureGenerator(
            db_engine=db_engine,
            features_schema_name=features_schema_name,
        ).process_table_tasks(aggregation_tasks)

        imputation_tasks = FeatureGenerator(
            db_engine=db_engine,
            features_schema_name=features_schema_name,
        ).generate_all_table_tasks(aggregations, task_type='imputation')
        assert_task_shape(imputation_tasks)
def test_index_column_lookup():
    """index_column_lookup should map imputed tables to their index columns.

    Index columns are the output date column plus the aggregation's groups.
    """
    shared_settings = dict(
        intervals=['all'],
        date_column='knowledge_date',
        output_date_column='as_of_date',
        dates=['2013-09-30', '2014-09-30'],
        state_table='states',
        state_group='entity_id',
        schema='features',
        from_obj='data',
    )
    aggregations = [
        SpacetimeAggregation(
            prefix='prefix1',
            aggregates=[
                Categorical(
                    col='cat_one',
                    function='sum',
                    choices=['good', 'bad', 'inbetween'],
                    impute_rules={'coltype': 'categorical', 'all': {'type': 'zero'}},
                )
            ],
            groups=['entity_id'],
            **shared_settings,
        ),
        SpacetimeAggregation(
            prefix='prefix2',
            aggregates=[
                Aggregate(
                    quantity='quantity_one',
                    function='count',
                    impute_rules={'coltype': 'aggregate', 'all': {'type': 'zero'}},
                )
            ],
            groups=['entity_id', 'zip_code'],
            **shared_settings,
        ),
    ]

    with testing.postgresql.Postgresql() as postgresql:
        db_engine = create_engine(postgresql.url())
        setup_db(db_engine)
        feature_generator = FeatureGenerator(
            db_engine=db_engine,
            features_schema_name='features',
        )
        lookup = feature_generator.index_column_lookup(aggregations)
        assert lookup == {
            'prefix1_aggregation_imputed': [
                'as_of_date',
                'entity_id',
            ],
            'prefix2_aggregation_imputed': ['as_of_date', 'entity_id', 'zip_code']
        }