def test_copy(games_es):
    """Copying an aggregation feature should preserve all of its attributes."""
    relationship = next(
        r for r in games_es.relationships if r.child_variable.id == 'home_team_id'
    )
    rel_path = RelationshipPath([(False, relationship)])
    original = ft.AggregationFeature(
        games_es['games']['home_team_score'],
        games_es['teams'],
        relationship_path=rel_path,
        primitive=ft.primitives.Mean,
    )

    duplicate = original.copy()

    assert duplicate.entity == original.entity
    assert duplicate.base_features == original.base_features
    assert duplicate.relationship_path == original.relationship_path
    assert duplicate.primitive == original.primitive
def test_diamond_entityset(diamond_es):
    """Summing transactions into regions through the two sides of a
    diamond-shaped schema should yield distinct, correctly-named columns."""
    es = diamond_es
    amount = ft.IdentityFeature(es['transactions']['amount'])

    customer_path = backward_path(es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(
        amount, es['regions'],
        primitive=ft.primitives.Sum,
        relationship_path=customer_path)

    store_path = backward_path(es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(
        amount, es['regions'],
        primitive=ft.primitives.Sum,
        relationship_path=store_path)

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 4, 8),
        feature_set=FeatureSet([through_customers, through_stores]))
    df = calculator.run(np.array([0, 1, 2]))

    assert (df['SUM(stores.transactions.amount)'] == [94, 261, 128]).all()
    assert (df['SUM(customers.transactions.amount)'] == [72, 411, 0]).all()
def test_feature_use_previous_pd_dateoffset(es):
    """A feature whose ``use_previous`` is a ``pd.DateOffset`` should
    serialize correctly, whatever fields the offset carries.

    The original test duplicated its entire body verbatim for two offsets;
    that duplication is factored into a loop over the offsets under test.
    """
    offsets = [
        pd.DateOffset(months=3),
        pd.DateOffset(months=3, days=2, minutes=30),
    ]
    for do in offsets:
        value = ft.IdentityFeature(es['log']['id'])
        count_feature = ft.AggregationFeature(
            value, es['customers'], ft.primitives.Count, use_previous=do)
        features = [count_feature, value]
        serializer = FeaturesSerializer(features)
        expected = {
            'ft_version': ft.__version__,
            'schema_version': SCHEMA_VERSION,
            'entityset': es.to_dictionary(),
            'feature_list': [count_feature.unique_name(), value.unique_name()],
            'feature_definitions': {
                count_feature.unique_name(): count_feature.to_dictionary(),
                value.unique_name(): value.to_dictionary(),
            }
        }
        _compare_feature_dicts(expected, serializer.to_dict())
def test_name_with_multiple_possible_paths(diamond_es):
    """Ambiguous aggregation targets are fine as long as an explicit
    relationship path is given; the name then includes the path."""
    forward_rels = diamond_es.get_forward_relationships('transactions')
    to_customer = next(
        r for r in forward_rels if r.parent_entity.id == 'customers')
    to_region = diamond_es.get_forward_relationships('customers')[0]

    # Does not raise if path specified.
    feat = ft.AggregationFeature(
        diamond_es['transactions']['amount'],
        diamond_es['regions'],
        ft.primitives.Mean,
        relationship_path=[to_region, to_customer])

    assert feat.get_name() == "MEAN(customers.transactions.amount)"
def test_copy(games_es):
    """Copying an aggregation feature should preserve all of its attributes."""
    rel = next(
        r for r in games_es.relationships
        if r._child_column_name == 'home_team_id'
    )
    rel_path = RelationshipPath([(False, rel)])
    source = ft.AggregationFeature(
        ft.IdentityFeature(games_es['games'].ww['home_team_score']),
        'teams',
        relationship_path=rel_path,
        primitive=ft.primitives.Mean,
    )

    clone = source.copy()

    assert clone.dataframe_name == source.dataframe_name
    assert clone.base_features == source.base_features
    assert clone.relationship_path == source.relationship_path
    assert clone.primitive == source.primitive
def test_diamond_entityset(diamond_es):
    """Summing transactions into regions through the two sides of a
    diamond-shaped schema should yield distinct, correctly-named columns."""
    es = diamond_es
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    via_customers = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Sum,
        relationship_path=backward_path(es, ["regions", "customers", "transactions"]),
    )
    via_stores = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Sum,
        relationship_path=backward_path(es, ["regions", "stores", "transactions"]),
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=datetime(2011, 4, 8),
        feature_set=FeatureSet([via_customers, via_stores]),
    )
    result = calculator.run(np.array([0, 1, 2]))
    # Normalize to pandas so the assertions work for all backends.
    result = to_pandas(result, index="id", sort_index=True)

    assert (result["SUM(stores.transactions.amount)"] == [94, 261, 128]).all()
    assert (result["SUM(customers.transactions.amount)"] == [72, 411, 0]).all()
def test_base_features_in_list(es):
    """Deserializing should return the features in feature_list order when
    a feature's dependency is itself in the list."""
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)

    definitions = {
        max_feat.unique_name(): max_feat.to_dictionary(),
        value.unique_name(): value.to_dictionary(),
    }
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': definitions,
    }

    assert FeaturesDeserializer(dictionary).to_list() == [max_feat, value]
def test_serialized_renamed_features(es):
    """Renaming a feature of any type should survive a serialize/deserialize
    round trip: the renamed original and the deserialized copy must both
    report the new name(s)."""
    def serialize_name_unchanged(original):
        new_name = 'MyFeature'
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        # Multi-output features get one indexed name per output slice.
        new_names = [new_name] if len(original_names) == 1 else [new_name + '[{}]'.format(i) for i in range(len(original_names))]
        check_names(renamed, new_name, new_names)
        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()
        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    # Build one representative feature of each feature-base type; each
    # default name is asserted first so a rename failure is unambiguous.
    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])
    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, es['customers'], primitive)
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(value, primitive, zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    # NMostCommon produces multiple outputs, exercising the indexed-name path.
    multioutput_original = ft.Feature(es['log']['product_id'], parent_entity=es['customers'], primitive=NMostCommon(n=2))
    assert multioutput_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    feature_type_list = [identity_original, agg_original, direct_original, transform_original, groupby_original, multioutput_original, featureslice_original]
    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)
def test_base_features_in_list(es):
    """Serializing a feature together with its dependency should list and
    define both exactly once."""
    value = ft.IdentityFeature(es['log']['value'])
    max_feature = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    serializer = FeaturesSerializer([max_feature, value])

    names = [max_feature.unique_name(), value.unique_name()]
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': names,
        'feature_definitions': {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    _compare_feature_dicts(expected, serializer.to_dict())
def test_feature_use_previous_pd_timedelta(es):
    """A feature whose ``use_previous`` is a ``pd.Timedelta`` should
    round-trip through ``FeaturesDeserializer``."""
    value = ft.IdentityFeature(es['log']['id'])
    # Originally pd.Timedelta(3, "M"): pandas deprecated and then removed
    # the ambiguous month/year units for Timedelta, so construct the
    # delta with an unambiguous fixed-length unit instead (matches the
    # companion tests that use 12 weeks).
    td = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(value, es['customers'], ft.primitives.Count, use_previous=td)
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)
    expected = [count_feature, value]
    assert expected == deserializer.to_list()
def test_base_features_in_list(es):
    """Deserializing should return the features in feature_list order when
    a feature's dependency is itself in the list."""
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feat = ft.AggregationFeature(value, "sessions", ft.primitives.Max)

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name(), value.unique_name()],
        "feature_definitions": {
            max_feat.unique_name(): max_feat.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        },
    }

    assert FeaturesDeserializer(dictionary).to_list() == [max_feat, value]
def test_base_features_in_list(es):
    """Serializing a feature together with its dependency should list and
    define both exactly once."""
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feature = ft.AggregationFeature(value, "sessions", ft.primitives.Max)
    serialized = FeaturesSerializer([max_feature, value]).to_dict()

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feature.unique_name(), value.unique_name()],
        "feature_definitions": {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        },
    }
    _compare_feature_dicts(expected, serialized)
def test_serialized_renamed_features(es):
    """Renaming a feature of any type should survive a serialize/deserialize
    round trip: the deserialized copy reports the same new name."""
    def serialize_name_unchanged(original):
        renamed = original.rename('MyFeature')
        assert renamed.get_name() == 'MyFeature'
        round_tripped = FeaturesDeserializer(
            FeaturesSerializer([renamed]).to_dict()).to_list()[0]
        assert round_tripped.get_name() == 'MyFeature'

    # One representative feature per feature-base type; each default name
    # is asserted first so a rename failure is unambiguous.
    identity_original = ft.IdentityFeature(es['log']['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log']['value'])
    agg_original = ft.AggregationFeature(
        value, es['customers'], ft.primitives.Max())
    assert agg_original.get_name() == 'MAX(log.value)'

    direct_original = ft.DirectFeature(es['customers']['age'], es['sessions'])
    assert direct_original.get_name() == 'customers.age'

    transform_original = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log']['zipcode'])
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, CumSum(), zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    for original in [identity_original, agg_original, direct_original,
                     transform_original, groupby_original]:
        serialize_name_unchanged(original)
def test_unknown_primitive_module(es):
    """Deserializing a feature whose primitive points at a nonexistent
    module should raise a descriptive RuntimeError."""
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)

    # Corrupt the serialized primitive's module so the lookup must fail.
    max_dict = max_feat.to_dictionary()
    max_dict['arguments']['primitive']['module'] = 'fake.module'

    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': {
            max_feat.unique_name(): max_dict,
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    error_text = 'Primitive "Max" in module "fake.module" not found'
    with pytest.raises(RuntimeError) as excinfo:
        deserializer.to_list()
    assert str(excinfo.value) == error_text
def test_base_features_not_in_list(es):
    """Transitive dependencies of a serialized feature must appear in
    feature_definitions even when they are not in the input list."""
    value = ft.IdentityFeature(es['log'].ww['value'])
    value_x2 = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    max_feature = ft.AggregationFeature(value_x2, 'sessions', ft.primitives.Max)

    serializer = FeaturesSerializer([max_feature])

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        # Only the requested feature is listed...
        'feature_list': [max_feature.unique_name()],
        # ...but its whole dependency chain is defined.
        'feature_definitions': {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value_x2.unique_name(): value_x2.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    _compare_feature_dicts(expected, serializer.to_dict())
def test_feature_use_previous_pd_timedelta(es):
    """A feature whose use_previous is a pd.Timedelta should round-trip
    through FeaturesDeserializer."""
    value = ft.IdentityFeature(es["log"].ww["id"])
    count_feature = ft.AggregationFeature(
        value,
        "customers",
        ft.primitives.Count,
        use_previous=pd.Timedelta(12, "W"),
    )

    names = [count_feature.unique_name(), value.unique_name()]
    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": names,
        "feature_definitions": {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        },
    }

    assert FeaturesDeserializer(dictionary).to_list() == [count_feature, value]
def test_where_feature_dependency(es):
    """A feature's `where` condition is a dependency and must be included
    in feature_definitions when serializing."""
    value = ft.IdentityFeature(es['log'].ww['value'])
    is_purchased = ft.IdentityFeature(es['log'].ww['purchased'])
    max_feature = ft.AggregationFeature(
        value, 'sessions', ft.primitives.Max, where=is_purchased)

    serializer = FeaturesSerializer([max_feature])

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name()],
        'feature_definitions': {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
            is_purchased.unique_name(): is_purchased.to_dictionary(),
        }
    }
    _compare_feature_dicts(expected, serializer.to_dict())
def test_unknown_primitive_module(es):
    """Deserializing a feature whose primitive points at a nonexistent
    module should raise a descriptive RuntimeError."""
    value = ft.IdentityFeature(es["log"].ww["value"])
    max_feat = ft.AggregationFeature(value, "sessions", ft.primitives.Max)

    # Corrupt the serialized primitive's module so the lookup must fail.
    max_dict = max_feat.to_dictionary()
    max_dict["arguments"]["primitive"]["module"] = "fake.module"

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": [max_feat.unique_name(), value.unique_name()],
        "feature_definitions": {
            max_feat.unique_name(): max_dict,
            value.unique_name(): value.to_dictionary(),
        },
    }
    deserializer = FeaturesDeserializer(dictionary)

    error_text = 'Primitive "Max" in module "fake.module" not found'
    with pytest.raises(RuntimeError) as excinfo:
        deserializer.to_list()
    assert str(excinfo.value) == error_text
def test_feature_use_previous_pd_timedelta(es):
    """A feature whose use_previous is a pd.Timedelta should serialize
    correctly alongside its dependency."""
    value = ft.IdentityFeature(es['log'].ww['id'])
    count_feature = ft.AggregationFeature(
        value, 'customers', ft.primitives.Count,
        use_previous=pd.Timedelta(12, "W"))

    serializer = FeaturesSerializer([count_feature, value])

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    _compare_feature_dicts(expected, serializer.to_dict())
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    """The FeatureSet trie should group features by relationship path.

    Each trie node holds a 3-tuple:
    (needs_full_dataframe flag, names needing the full dataframe, regular
    feature names at that path). None of the features here require the
    full dataframe, so the flag is False and the middle set is empty at
    every node checked below.
    """
    es = diamond_es
    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    # Two aggregations of the same base feature through the two sides of
    # the diamond schema (regions -> customers / stores -> transactions).
    path_through_customers = backward_path(es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers,
    )
    path_through_stores = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores,
    )

    # A stacked feature: mean (at customers) -> negate -> mean (at regions),
    # so intermediate features live at the regions->customers node.
    customers_to_transactions = backward_path(es, ["customers", "transactions"])
    customers_mean = ft.AggregationFeature(
        amount,
        "customers",
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions,
    )
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = ft.AggregationFeature(
        negation,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers,
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]

    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    # Root node: all requested features live at the target dataframe.
    assert trie.value == (False, set(), {f.unique_name() for f in features})
    # Forward path of the direct feature holds its base identity feature.
    assert trie.get_node(direct_name.relationship_path).value == (
        False,
        set(),
        {country_name.unique_name()},
    )
    # Intermediate features of the stacked aggregation.
    assert trie.get_node(regions_to_customers).value == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )
    # No features are computed at regions -> stores itself.
    regions_to_stores = backward_path(es, ["regions", "stores"])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    # The shared base feature appears at the leaf of each full path.
    assert trie.get_node(path_through_customers).value == (
        False,
        set(),
        {amount.unique_name()},
    )
    assert trie.get_node(path_through_stores).value == (
        False,
        set(),
        {amount.unique_name()},
    )
def test_serialized_renamed_features(es):
    """Renaming a feature of any type should survive a serialize/deserialize
    round trip: the renamed original and the deserialized copy must both
    report the new name(s)."""
    def serialize_name_unchanged(original):
        new_name = "MyFeature"
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        # Multi-output features get one indexed name per output slice.
        new_names = (
            [new_name]
            if len(original_names) == 1
            else [new_name + "[{}]".format(i) for i in range(len(original_names))]
        )
        check_names(renamed, new_name, new_names)
        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()
        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    # Build one representative feature of each feature-base type; each
    # default name is asserted first so a rename failure is unambiguous.
    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])
    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, "customers", primitive)
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    # NMostCommon produces multiple outputs, exercising the indexed-name path.
    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    feature_type_list = [
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ]
    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)