def test_base_features_in_list(es):
    value = ft.IdentityFeature(es['log']['value'])
    max_feature = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    features = [max_feature, value]
    serializer = FeaturesSerializer(features)

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())

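# `_compare_feature_dicts` is used throughout the serializer tests in this
# section but is not defined here. A minimal sketch of the comparison it
# presumably performs, assuming a plain key-by-key equality check (the real
# helper in the test module may be more lenient, e.g. about the ordering of
# primitive definitions):
def _compare_feature_dicts(expected, actual):
    # Compare top-level keys first so a mismatch reports which key differs.
    assert expected.keys() == actual.keys()
    for key in expected:
        assert expected[key] == actual[key], "mismatch for key {!r}".format(key)
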
def test_base_features_in_list(es):
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': {
            max_feat.unique_name(): max_feat.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    expected = [max_feat, value]
    assert expected == deserializer.to_list()

def test_feature_use_previous_pd_timedelta(es):
    value = ft.IdentityFeature(es['log']['id'])
    # NOTE: "M" (months) is an approximate unit that newer versions of pandas
    # reject for Timedelta; the later variants of this test use weeks instead.
    td = pd.Timedelta(3, "M")
    count_feature = ft.AggregationFeature(value, es['customers'], ft.primitives.Count,
                                          use_previous=td)
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    expected = [count_feature, value]
    assert expected == deserializer.to_list()

def test_base_features_in_list(es): value = ft.IdentityFeature(es["log"].ww["value"]) max_feat = ft.AggregationFeature(value, "sessions", ft.primitives.Max) dictionary = { "ft_version": ft.__version__, "schema_version": SCHEMA_VERSION, "entityset": es.to_dictionary(), "feature_list": [max_feat.unique_name(), value.unique_name()], "feature_definitions": { max_feat.unique_name(): max_feat.to_dictionary(), value.unique_name(): value.to_dictionary(), }, } deserializer = FeaturesDeserializer(dictionary) expected = [max_feat, value] assert expected == deserializer.to_list()
def test_base_features_in_list(es): value = ft.IdentityFeature(es["log"].ww["value"]) max_feature = ft.AggregationFeature(value, "sessions", ft.primitives.Max) features = [max_feature, value] serializer = FeaturesSerializer(features) expected = { "ft_version": ft.__version__, "schema_version": SCHEMA_VERSION, "entityset": es.to_dictionary(), "feature_list": [max_feature.unique_name(), value.unique_name()], "feature_definitions": { max_feature.unique_name(): max_feature.to_dictionary(), value.unique_name(): value.to_dictionary(), }, } _compare_feature_dicts(expected, serializer.to_dict())
def test_multi_output_features(es):
    product_id = ft.IdentityFeature(es["log"].ww["product_id"])
    threecommon = NMostCommon()
    num_unique = NumUnique()
    tc = ft.Feature(product_id, parent_dataframe_name="sessions", primitive=threecommon)

    # Alternate NUM_UNIQUE(tc[i]) and tc[i], so the NUM_UNIQUE features land
    # at indices 2, 4, and 6 of the feature list.
    features = [tc, product_id]
    for i in range(3):
        features.append(
            ft.Feature(
                tc[i],
                parent_dataframe_name="customers",
                primitive=num_unique,
            )
        )
        features.append(tc[i])

    serializer = FeaturesSerializer(features)

    flist = [feat.unique_name() for feat in features]
    fd = [feat.to_dictionary() for feat in features]
    fdict = dict(zip(flist, fd))

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": flist,
        "feature_definitions": fdict,
    }
    expected["primitive_definitions"] = {
        "0": serialize_primitive(threecommon),
        "1": serialize_primitive(num_unique),
    }
    expected["feature_definitions"][flist[0]]["arguments"]["primitive"] = "0"
    expected["feature_definitions"][flist[2]]["arguments"]["primitive"] = "1"
    expected["feature_definitions"][flist[4]]["arguments"]["primitive"] = "1"
    expected["feature_definitions"][flist[6]]["arguments"]["primitive"] = "1"

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)

def test_multioutput_feature(es):
    value = ft.IdentityFeature(es["log"].ww["product_id"])
    threecommon = NMostCommon()
    num_unique = NumUnique()
    tc = ft.Feature(value, parent_dataframe_name="sessions", primitive=threecommon)

    features = [tc, value]
    for i in range(3):
        features.append(
            ft.Feature(
                tc[i],
                parent_dataframe_name="customers",
                primitive=num_unique,
            )
        )
        features.append(tc[i])

    flist = [feat.unique_name() for feat in features]
    fd = [feat.to_dictionary() for feat in features]
    fdict = dict(zip(flist, fd))

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": flist,
        "feature_definitions": fdict,
    }
    dictionary["primitive_definitions"] = {
        "0": serialize_primitive(threecommon),
        "1": serialize_primitive(num_unique),
    }
    dictionary["feature_definitions"][flist[0]]["arguments"]["primitive"] = "0"
    dictionary["feature_definitions"][flist[2]]["arguments"]["primitive"] = "1"
    dictionary["feature_definitions"][flist[4]]["arguments"]["primitive"] = "1"
    dictionary["feature_definitions"][flist[6]]["arguments"]["primitive"] = "1"

    deserialized = FeaturesDeserializer(dictionary).to_list()
    for i in range(len(features)):
        assert features[i].unique_name() == deserialized[i].unique_name()

def test_diamond_entityset(diamond_es):
    es = diamond_es

    amount = ft.IdentityFeature(es['transactions']['amount'])
    path = backward_path(es, ['regions', 'customers', 'transactions'])
    through_customers = ft.AggregationFeature(amount, es['regions'],
                                              primitive=ft.primitives.Sum,
                                              relationship_path=path)
    path = backward_path(es, ['regions', 'stores', 'transactions'])
    through_stores = ft.AggregationFeature(amount, es['regions'],
                                           primitive=ft.primitives.Sum,
                                           relationship_path=path)

    feature_set = FeatureSet([through_customers, through_stores])
    calculator = FeatureSetCalculator(es,
                                      time_last=datetime(2011, 4, 8),
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))

    assert (df['SUM(stores.transactions.amount)'] == [94, 261, 128]).all()
    assert (df['SUM(customers.transactions.amount)'] == [72, 411, 0]).all()

def test_to_dictionary_where(es):
    actual = ft.Feature(
        es['log'].ww['value'],
        parent_dataframe_name='sessions',
        where=ft.IdentityFeature(es['log'].ww['value']) == 2,
        primitive=Sum
    ).to_dictionary()

    expected = {
        'type': 'AggregationFeature',
        'dependencies': ['log: value', 'log: value = 2'],
        'arguments': {
            'name': None,
            'base_features': ['log: value'],
            'relationship_path': [{
                'parent_dataframe_name': 'sessions',
                'child_dataframe_name': 'log',
                'parent_column_name': 'id',
                'child_column_name': 'session_id',
            }],
            'primitive': {
                'type': 'Sum',
                'module': 'featuretools.primitives.standard.aggregation_primitives',
                'arguments': {},
            },
            'where': 'log: value = 2',
            'use_previous': None,
        }
    }
    assert expected == actual

def test_base_features_not_in_list(es):
    value = ft.IdentityFeature(es['log'].ww['value'])
    value_x2 = ft.TransformFeature(
        value, ft.primitives.MultiplyNumericScalar(value=2))
    max_feature = ft.AggregationFeature(value_x2, 'sessions', ft.primitives.Max)
    features = [max_feature]
    serializer = FeaturesSerializer(features)

    # Only max_feature is passed to the serializer, but its dependencies
    # (value_x2 and value) must still appear in the feature definitions.
    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feature.unique_name()],
        'feature_definitions': {
            max_feature.unique_name(): max_feature.to_dictionary(),
            value_x2.unique_name(): value_x2.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())

def test_unknown_primitive_module(es):
    value = ft.IdentityFeature(es['log']['value'])
    max_feat = ft.AggregationFeature(value, es['sessions'], ft.primitives.Max)
    max_dict = max_feat.to_dictionary()
    max_dict['arguments']['primitive']['module'] = 'fake.module'
    dictionary = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [max_feat.unique_name(), value.unique_name()],
        'feature_definitions': {
            max_feat.unique_name(): max_dict,
            value.unique_name(): value.to_dictionary(),
        }
    }
    deserializer = FeaturesDeserializer(dictionary)

    with pytest.raises(RuntimeError) as excinfo:
        deserializer.to_list()

    error_text = 'Primitive "Max" in module "fake.module" not found'
    assert error_text == str(excinfo.value)

def test_feature_use_previous_pd_timedelta(es): value = ft.IdentityFeature(es["log"].ww["id"]) td = pd.Timedelta(12, "W") count_feature = ft.AggregationFeature(value, "customers", ft.primitives.Count, use_previous=td) dictionary = { "ft_version": ft.__version__, "schema_version": SCHEMA_VERSION, "entityset": es.to_dictionary(), "feature_list": [count_feature.unique_name(), value.unique_name()], "feature_definitions": { count_feature.unique_name(): count_feature.to_dictionary(), value.unique_name(): value.to_dictionary(), }, } deserializer = FeaturesDeserializer(dictionary) expected = [count_feature, value] assert expected == deserializer.to_list()
def test_feature_use_previous_pd_timedelta(es):
    value = ft.IdentityFeature(es['log'].ww['id'])
    td = pd.Timedelta(12, "W")
    count_feature = ft.AggregationFeature(value, 'customers', ft.primitives.Count,
                                          use_previous=td)
    features = [count_feature, value]
    serializer = FeaturesSerializer(features)

    expected = {
        'ft_version': ft.__version__,
        'schema_version': SCHEMA_VERSION,
        'entityset': es.to_dictionary(),
        'feature_list': [count_feature.unique_name(), value.unique_name()],
        'feature_definitions': {
            count_feature.unique_name(): count_feature.to_dictionary(),
            value.unique_name(): value.to_dictionary(),
        }
    }

    _compare_feature_dicts(expected, serializer.to_dict())

def test_unknown_primitive_module(es): value = ft.IdentityFeature(es["log"].ww["value"]) max_feat = ft.AggregationFeature(value, "sessions", ft.primitives.Max) max_dict = max_feat.to_dictionary() max_dict["arguments"]["primitive"]["module"] = "fake.module" dictionary = { "ft_version": ft.__version__, "schema_version": SCHEMA_VERSION, "entityset": es.to_dictionary(), "feature_list": [max_feat.unique_name(), value.unique_name()], "feature_definitions": { max_feat.unique_name(): max_dict, value.unique_name(): value.to_dictionary(), }, } deserializer = FeaturesDeserializer(dictionary) with pytest.raises(RuntimeError) as excinfo: deserializer.to_list() error_text = 'Primitive "Max" in module "fake.module" not found' assert error_text == str(excinfo.value)
def test_diamond_entityset(diamond_es): es = diamond_es amount = ft.IdentityFeature(es["transactions"].ww["amount"]) path = backward_path(es, ["regions", "customers", "transactions"]) through_customers = ft.AggregationFeature(amount, "regions", primitive=ft.primitives.Sum, relationship_path=path) path = backward_path(es, ["regions", "stores", "transactions"]) through_stores = ft.AggregationFeature(amount, "regions", primitive=ft.primitives.Sum, relationship_path=path) feature_set = FeatureSet([through_customers, through_stores]) calculator = FeatureSetCalculator(es, time_last=datetime(2011, 4, 8), feature_set=feature_set) df = calculator.run(np.array([0, 1, 2])) df = to_pandas(df, index="id", sort_index=True) assert (df["SUM(stores.transactions.amount)"] == [94, 261, 128]).all() assert (df["SUM(customers.transactions.amount)"] == [72, 411, 0]).all()
def test_feature_trie_without_needs_full_dataframe(diamond_es):
    es = diamond_es
    country_name = ft.IdentityFeature(es["countries"].ww["name"])
    direct_name = ft.DirectFeature(country_name, "regions")
    amount = ft.IdentityFeature(es["transactions"].ww["amount"])

    path_through_customers = backward_path(es, ["regions", "customers", "transactions"])
    through_customers = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_customers,
    )
    path_through_stores = backward_path(es, ["regions", "stores", "transactions"])
    through_stores = ft.AggregationFeature(
        amount,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=path_through_stores,
    )
    customers_to_transactions = backward_path(es, ["customers", "transactions"])
    customers_mean = ft.AggregationFeature(
        amount,
        "customers",
        primitive=ft.primitives.Mean,
        relationship_path=customers_to_transactions,
    )
    negation = ft.TransformFeature(customers_mean, ft.primitives.Negate)
    regions_to_customers = backward_path(es, ["regions", "customers"])
    mean_of_mean = ft.AggregationFeature(
        negation,
        "regions",
        primitive=ft.primitives.Mean,
        relationship_path=regions_to_customers,
    )

    features = [direct_name, through_customers, through_stores, mean_of_mean]
    feature_set = FeatureSet(features)
    trie = feature_set.feature_trie

    # Each node value is a tuple of (needs_full_dataframe flag, names of
    # features that need the full dataframe, names of the remaining features).
    assert trie.value == (False, set(), {f.unique_name() for f in features})
    assert trie.get_node(direct_name.relationship_path).value == (
        False,
        set(),
        {country_name.unique_name()},
    )
    assert trie.get_node(regions_to_customers).value == (
        False,
        set(),
        {negation.unique_name(), customers_mean.unique_name()},
    )

    regions_to_stores = backward_path(es, ["regions", "stores"])
    assert trie.get_node(regions_to_stores).value == (False, set(), set())
    assert trie.get_node(path_through_customers).value == (
        False,
        set(),
        {amount.unique_name()},
    )
    assert trie.get_node(path_through_stores).value == (
        False,
        set(),
        {amount.unique_name()},
    )

def test_relationship_path(es):
    value = ft.IdentityFeature(es['log']['value'])

    assert value.relationship_path == []

def test_relationship_path(es): value = ft.IdentityFeature(es["log"].ww["value"]) assert len(value.relationship_path) == 0
def test_cum_sum_numpy_group_on_nan(pd_es):
    class CumSumNumpy(TransformPrimitive):
        """Returns the cumulative sum after grouping"""

        name = "cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        uses_full_dataframe = True

        def get_function(self):
            def cum_sum(values):
                return values.cumsum().values

            return cum_sum

    log_value_feat = ft.IdentityFeature(pd_es["log"].ww["value"])
    pd_es["log"]["product_id"] = (
        ["coke zero"] * 3
        + ["car"] * 2
        + ["toothpaste"] * 3
        + ["brown bag"] * 2
        + ["shoes"]
        + [np.nan] * 4
        + ["coke_zero"] * 2
    )
    pd_es["log"]["value"][16] = 10
    cum_sum = ft.Feature(
        log_value_feat,
        groupby=ft.IdentityFeature(pd_es["log"].ww["product_id"]),
        primitive=CumSumNumpy,
    )
    assert cum_sum.get_name() == "CUM_SUM(value) by product_id"

    features = [cum_sum]
    df = ft.calculate_feature_matrix(
        entityset=pd_es, features=features, instance_ids=range(17)
    )
    cvalues = df[cum_sum.get_name()].values
    assert len(cvalues) == 17

    # Grouping on a NaN key yields NaN cumulative sums for those rows.
    cum_sum_values = [
        0, 5, 15, 15, 35,
        0, 1, 3, 3, 3,
        0,
        np.nan, np.nan, np.nan, np.nan, np.nan,
        10,
    ]
    assert len(cvalues) == len(cum_sum_values)
    for i, v in enumerate(cum_sum_values):
        if np.isnan(v):
            assert np.isnan(cvalues[i])
        else:
            assert v == cvalues[i]

def test_serialized_renamed_features(es):
    def serialize_name_unchanged(original):
        new_name = "MyFeature"
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        new_names = (
            [new_name]
            if len(original_names) == 1
            else [new_name + "[{}]".format(i) for i in range(len(original_names))]
        )
        check_names(renamed, new_name, new_names)

        serializer = FeaturesSerializer([renamed])
        serialized = serializer.to_dict()

        deserializer = FeaturesDeserializer(serialized)
        deserialized = deserializer.to_list()[0]
        check_names(deserialized, new_name, new_names)

    identity_original = ft.IdentityFeature(es["log"].ww["value"])
    assert identity_original.get_name() == "value"

    value = ft.IdentityFeature(es["log"].ww["value"])

    primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, "customers", primitive)
    assert agg_original.get_name() == "MAX(log.value)"

    direct_original = ft.DirectFeature(
        ft.IdentityFeature(es["customers"].ww["age"]), "sessions"
    )
    assert direct_original.get_name() == "customers.age"

    primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, primitive)
    assert transform_original.get_name() == "value * 2"

    zipcode = ft.IdentityFeature(es["log"].ww["zipcode"])
    primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, primitive, zipcode
    )
    assert groupby_original.get_name() == "CUM_SUM(value) by zipcode"

    multioutput_original = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)"

    featureslice_original = ft.feature_base.FeatureOutputSlice(multioutput_original, 0)
    assert featureslice_original.get_name() == "N_MOST_COMMON(log.product_id, n=2)[0]"

    feature_type_list = [
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ]

    for feature_type in feature_type_list:
        serialize_name_unchanged(feature_type)

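# `check_names` is likewise referenced above without a definition in this
# section. A plausible minimal sketch, assuming it verifies a feature's
# display name and its per-output names after renaming:
def check_names(feature, expected_name, expected_output_names):
    assert feature.get_name() == expected_name
    assert feature.get_feature_names() == expected_output_names
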