def test_encode_features_topn(entityset):
    """Check that encoding keeps the ``NMostCommon(n=3)`` feature on customers.

    Runs ``dfs`` for customers 0-2 with ``NMostCommon(n=3)`` as the only
    aggregation primitive, encodes the result with ``include_unknown=True``,
    and asserts that the hand-built feature's hash and every one of its output
    names survive the encoding.
    """
    expected_feature = Feature(
        entityset['log']['product_id'],
        parent_entity=entityset['customers'],
        primitive=NMostCommon(n=3),
    )
    matrix, definitions = dfs(
        entityset=entityset,
        instance_ids=[0, 1, 2],
        target_entity="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded_matrix, encoded_definitions = encode_features(
        matrix, definitions, include_unknown=True
    )

    encoded_hashes = [definition.hash() for definition in encoded_definitions]
    assert expected_feature.hash() in encoded_hashes

    for output_name in expected_feature.get_feature_names():
        assert output_name in encoded_matrix.columns
def test_topn(entityset, backend):
    """Compute ``NMostCommon`` (n=2) of log product ids for customers 0-2.

    The feature's column must exist in the calculated frame, and each row's
    values must equal the expected products when compared as sets, so the
    order inside a row is not checked.
    """
    feature = NMostCommon(
        entityset['log']['product_id'],
        entityset['customers'],
        n=2,
    )
    calculator = backend([feature])
    matrix = calculator.calculate_all_features(
        instance_ids=[0, 1, 2],
        time_last=None,
    )
    expected_rows = [
        ['toothpaste', 'coke zero'],
        ['coke zero', 'Haribo sugar-free gummy bears'],
        ['taco clock'],
    ]

    assert feature.get_name() in matrix.columns

    for row_index, actual in enumerate(matrix[feature.get_name()].values):
        assert set(expected_rows[row_index]) == set(actual)
def test_stacking_multi(pd_es):
    """Stack ``NumUnique`` on each of the three ``NMostCommon`` output slices.

    The session-level three-most-common product ids are sliced, each slice is
    aggregated to customers with ``NumUnique``, and the resulting matrix for
    customers 0-2 is compared against two accepted sets of values.
    """
    most_common = NMostCommon(3)
    session_common = ft.Feature(
        pd_es['log']['product_id'],
        parent_entity=pd_es["sessions"],
        primitive=most_common,
    )
    stacked = [
        ft.Feature(
            session_common[idx],
            parent_entity=pd_es['customers'],
            primitive=NumUnique,
        )
        for idx in range(3)
    ]

    matrix = ft.calculate_feature_matrix(
        stacked, entityset=pd_es, instance_ids=[0, 1, 2]
    )

    # Both value sets are acceptable: which one appears depends on how the
    # initial n-most-common function orders two values, and that ordering
    # changes arbitrarily.
    accepted_a = [[3, 2, 1], [2, 1, 0], [0, 0, 0]]
    accepted_b = [[3, 1, 1], [2, 1, 0], [0, 0, 0]]

    for idx in range(3):
        expected_name = 'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % idx
        columns = matrix.columns
        assert expected_name in columns
        assert matrix[columns[idx]].tolist() in (accepted_a[idx], accepted_b[idx])
def test_rename_multioutput(es):
    """Rename a two-output ``NMostCommon`` feature and verify via ``check_rename``.

    The expected per-output names are the new name suffixed with ``[0]`` and
    ``[1]``.
    """
    two_most_common = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    check_rename(
        two_most_common,
        'session_test',
        ['session_test[0]', 'session_test[1]'],
    )
def test_stacks_multioutput_features(es):
    """Verify that ``dfs`` stacks primitives on slices of multi-output features.

    Uses a local six-output transform primitive plus ``NMostCommon(n=3)`` and
    asserts that the feature list contains ``NUM_UNIQUE`` stacked on each
    ``N_MOST_COMMON`` slice and ``DIFF`` stacked on each ``TEST_TIME`` slice.
    """
    class TestTime(TransformPrimitive):
        # One output per datetime component listed in the inner function.
        name = "test_time"
        input_types = [Datetime]
        return_type = Numeric
        number_output_features = 6

        def get_function(self):
            def test_f(x):
                stamps = pd.Series(x)
                components = []
                for unit in ["year", "month", "day", "hour", "minute", "second"]:
                    # apply() runs immediately, so the lambda sees this unit.
                    components.append(stamps.apply(lambda stamp: getattr(stamp, unit)))
                return components
            return test_f

    definitions = ft.dfs(
        entityset=es,
        target_entity="customers",
        agg_primitives=[NumUnique, NMostCommon(n=3)],
        trans_primitives=[TestTime, Diff],
        max_depth=4,
        features_only=True,
    )

    expected_names = [
        'NUM_UNIQUE(sessions.N_MOST_COMMON(log.countrycode)[%d])' % i
        for i in range(3)
    ]
    expected_names += [
        'DIFF(TEST_TIME(date_of_birth)[%d])' % i
        for i in range(6)
    ]
    for expected_name in expected_names:
        assert feature_with_name(definitions, expected_name)
def test_topn(entityset, backend):
    """Check the two most common log product ids for customers 0, 1 and 2.

    Each calculated row is compared with the expected products as a set, so
    ordering among the returned values does not matter.
    """
    top_two = NMostCommon(entityset['log']['product_id'],
                          entityset['customers'],
                          n=2)
    calculated = backend([top_two]).calculate_all_features(
        instance_ids=[0, 1, 2], time_last=None)

    expected_sets = [
        {'toothpaste', 'coke zero'},
        {'coke zero', 'Haribo sugar-free gummy bears'},
        {'taco clock'},
    ]

    assert top_two.get_name() in calculated.columns

    column_values = calculated[top_two.get_name()].values
    for position, row_values in enumerate(column_values):
        assert expected_sets[position] == set(row_values)
def test_topn(pd_es):
    """Check ``NMostCommon(n=2)`` of log product ids for customers 0, 1 and 2.

    The feature is evaluated through ``FeatureSetCalculator`` and compared row
    by row with the expected products. Row 0 is compared as a set because
    'coke zero' and 'toothpaste' occur the same number of times; the other
    rows are compared position by position, with missing values treated as
    equal to each other.
    """
    topn = ft.Feature(pd_es['log']['product_id'],
                      parent_entity=pd_es['customers'],
                      primitive=NMostCommon(n=2))
    feature_set = FeatureSet([topn])
    calculator = FeatureSetCalculator(pd_es,
                                      time_last=None,
                                      feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    true_results = pd.DataFrame(
        [['toothpaste', 'coke zero'],
         ['coke zero', 'Haribo sugar-free gummy bears'],
         ['taco clock', np.nan]])

    # all() is required here: asserting the bare list would always pass,
    # because a non-empty list is truthy whatever booleans it contains.
    assert all(name in df.columns for name in topn.get_feature_names())

    for i in range(df.shape[0]):
        true = true_results.loc[i]
        actual = df.loc[i]
        if i == 0:
            # coke zero and toothpaste have the same number of occurrences
            assert set(true.values) == set(actual.values)
        else:
            for i1, i2 in zip(true, actual):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_initialized_agg_prim(es):
    """Pass an already-instantiated ``NMostCommon(n=3)`` to DeepFeatureSynthesis.

    With no transform primitives and ``sessions`` as the target, the built
    features must include ``N_MOST_COMMON(log.product_id)``.
    """
    three_most = NMostCommon(n=3)
    synthesis = DeepFeatureSynthesis(
        target_entity_id="sessions",
        entityset=es,
        agg_primitives=[three_most],
        trans_primitives=[],
    )
    built = synthesis.build_features()
    assert feature_with_name(built, "N_MOST_COMMON(log.product_id)")
def test_encode_features_topn(pd_es):
    """Check that encoding keeps the ``NMostCommon(n=3)`` feature exactly once.

    After ``dfs`` and ``encode_features`` (with ``include_unknown=True``), the
    hand-built feature's unique name must be among the encoded definitions and
    each of its output names must appear in the encoded columns exactly one
    time.
    """
    expected_feature = Feature(
        Feature(pd_es['log'].ww['product_id']),
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=3),
    )
    matrix, definitions = dfs(
        entityset=pd_es,
        instance_ids=[0, 1, 2],
        target_dataframe_name="customers",
        agg_primitives=[NMostCommon(n=3)],
    )
    encoded_matrix, encoded_definitions = encode_features(
        matrix, definitions, include_unknown=True
    )

    encoded_unique_names = [
        definition.unique_name() for definition in encoded_definitions
    ]
    assert expected_feature.unique_name() in encoded_unique_names

    for output_name in expected_feature.get_feature_names():
        assert output_name in encoded_matrix.columns
        occurrences = encoded_matrix.columns.tolist().count(output_name)
        assert occurrences == 1
def test_rename_featureoutputslice(es):
    """Rename output slice 0 of a two-output feature and verify via ``check_rename``.

    A slice is expected to have a single output name equal to the new name.
    """
    parent = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    first_output = ft.feature_base.FeatureOutputSlice(parent, 0)
    check_rename(first_output, 'session_test', ['session_test'])
def test_multi_output_base_error_agg(es):
    """Stacking an aggregation on an unsliced multi-output feature must fail.

    Building ``NumUnique`` directly on the whole ``NMostCommon(3)`` feature is
    expected to raise ``ValueError`` with the stacking error message.
    """
    session_common = ft.Feature(
        es['log']['product_id'],
        parent_entity=es["sessions"],
        primitive=NMostCommon(3),
    )
    expected_message = "Cannot stack on whole multi-output feature."
    with pytest.raises(ValueError, match=expected_message):
        ft.Feature(
            session_common,
            parent_entity=es['customers'],
            primitive=NumUnique,
        )
def test_multioutput_description(es):
    """Check ``describe_feature`` text for multi-output features and slices.

    Three cases are covered: the ``NMostCommon`` aggregation, a local
    four-output transform primitive without a description template, and that
    same primitive after ``description_template`` is set first to a
    two-entry list (using ``{nth_slice}``) and then to a three-entry list, for
    which describing slice 2 must raise ``IndexError``.
    """
    primitive = NMostCommon(2)
    zipcode_by_session = AggregationFeature(
        IdentityFeature(es['log'].ww['zipcode']), 'sessions', primitive)
    top_slice = zipcode_by_session[0]
    runner_up_slice = zipcode_by_session[1]

    assert describe_feature(zipcode_by_session) == (
        'The 2 most common values of the "zipcode" of all instances of "log" for each "id" in "sessions".'
    )
    assert describe_feature(top_slice) == (
        'The most common value of the "zipcode" of all instances of "log" '
        'for each "id" in "sessions".'
    )
    assert describe_feature(runner_up_slice) == (
        'The 2nd most common value of the "zipcode" of all instances of '
        '"log" for each "id" in "sessions".'
    )

    class CustomMultiOutput(TransformPrimitive):
        name = "custom_multioutput"
        input_types = [ColumnSchema(semantic_tags={'category'})]
        return_type = ColumnSchema(semantic_tags={'category'})
        number_output_features = 4

    zipcode_outputs = TransformFeature(
        IdentityFeature(es['log'].ww['zipcode']), CustomMultiOutput)

    # No template set on the primitive: generic wording is expected.
    assert describe_feature(zipcode_outputs) == (
        'The result of applying CUSTOM_MULTIOUTPUT to the "zipcode".')
    assert describe_feature(zipcode_outputs[0]) == (
        'The 1st output from applying CUSTOM_MULTIOUTPUT to the "zipcode".')
    assert describe_feature(zipcode_outputs[1]) == (
        'The 2nd output from applying CUSTOM_MULTIOUTPUT to the "zipcode".')

    # Two-entry template: base text plus one pattern shared by every slice.
    CustomMultiOutput.description_template = [
        'the multioutput of {}',
        'the {nth_slice} multioutput part of {}',
    ]
    assert describe_feature(zipcode_outputs) == 'The multioutput of the "zipcode".'
    ordinal_descriptions = [
        'The 1st multioutput part of the "zipcode".',
        'The 2nd multioutput part of the "zipcode".',
        'The 3rd multioutput part of the "zipcode".',
        'The 4th multioutput part of the "zipcode".',
    ]
    for position, description in enumerate(ordinal_descriptions):
        assert describe_feature(zipcode_outputs[position]) == description

    # Three-entry template: base text plus explicit text for slices 0 and 1.
    CustomMultiOutput.description_template = [
        'the multioutput of {}',
        'the primary multioutput part of {}',
        'the secondary multioutput part of {}',
    ]
    assert describe_feature(zipcode_outputs) == 'The multioutput of the "zipcode".'
    assert describe_feature(zipcode_outputs[0]) == (
        'The primary multioutput part of the "zipcode".')
    assert describe_feature(zipcode_outputs[1]) == (
        'The secondary multioutput part of the "zipcode".')

    with pytest.raises(IndexError, match='Slice out of range of template'):
        describe_feature(zipcode_outputs[2])
def test_serialized_renamed_features(es):
    """Renamed features must keep their names through a serialization round trip.

    One feature of each kind (identity, aggregation, direct, transform,
    groupby transform, multi-output, output slice) is built, its default name
    is asserted, and then it is renamed, serialized, deserialized and checked
    with ``check_names`` before and after the round trip.
    """
    def serialize_name_unchanged(original):
        # Rename, then compare names before and after serialize/deserialize.
        new_name = 'MyFeature'
        original_names = original.get_feature_names()
        renamed = original.rename(new_name)
        if len(original_names) == 1:
            new_names = [new_name]
        else:
            new_names = [
                new_name + '[{}]'.format(i)
                for i in range(len(original_names))
            ]
        check_names(renamed, new_name, new_names)

        serialized = FeaturesSerializer([renamed]).to_dict()
        restored = FeaturesDeserializer(serialized).to_list()[0]
        check_names(restored, new_name, new_names)

    identity_original = ft.IdentityFeature(es['log'].ww['value'])
    assert identity_original.get_name() == 'value'

    value = ft.IdentityFeature(es['log'].ww['value'])

    max_primitive = ft.primitives.Max()
    agg_original = ft.AggregationFeature(value, 'customers', max_primitive)
    assert agg_original.get_name() == 'MAX(log.value)'

    customer_age = ft.IdentityFeature(es['customers'].ww['age'])
    direct_original = ft.DirectFeature(customer_age, 'sessions')
    assert direct_original.get_name() == 'customers.age'

    doubling_primitive = ft.primitives.MultiplyNumericScalar(value=2)
    transform_original = ft.TransformFeature(value, doubling_primitive)
    assert transform_original.get_name() == 'value * 2'

    zipcode = ft.IdentityFeature(es['log'].ww['zipcode'])
    cum_sum_primitive = CumSum()
    groupby_original = ft.feature_base.GroupByTransformFeature(
        value, cum_sum_primitive, zipcode)
    assert groupby_original.get_name() == 'CUM_SUM(value) by zipcode'

    multioutput_original = ft.Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    assert multioutput_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)'

    featureslice_original = ft.feature_base.FeatureOutputSlice(
        multioutput_original, 0)
    assert featureslice_original.get_name() == 'N_MOST_COMMON(log.product_id, n=2)[0]'

    for candidate in (
        identity_original,
        agg_original,
        direct_original,
        transform_original,
        groupby_original,
        multioutput_original,
        featureslice_original,
    ):
        serialize_name_unchanged(candidate)
def test_rename_multioutput(es):
    """Rename a two-output ``NMostCommon`` feature and verify via ``check_rename``.

    The expected output names are the new name with ``[0]`` and ``[1]``
    appended.
    """
    renamed_to = "session_test"
    expected_outputs = ["session_test[0]", "session_test[1]"]
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    check_rename(product_top_two, renamed_to, expected_outputs)
def test_set_feature_names_aggregation_feature(es):
    """Custom output names set on an aggregation feature are returned unchanged."""
    custom_names = ["agg_col_1", "second_agg_col"]
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    product_top_two.set_feature_names(custom_names)
    assert product_top_two.get_feature_names() == custom_names
def test_multi_output_base_error_agg(es):
    """Aggregating over an unsliced multi-output feature must raise ``ValueError``.

    ``NumUnique`` is applied to the whole ``NMostCommon(3)`` feature rather
    than to one of its slices, and the stacking error message is expected.
    """
    session_common = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(3),
        parent_dataframe_name="sessions",
    )
    with pytest.raises(
        ValueError, match="Cannot stack on whole multi-output feature."
    ):
        ft.Feature(
            session_common,
            primitive=NumUnique,
            parent_dataframe_name="customers",
        )
def test_rename_featureoutputslice(es):
    """Rename slice 0 of a two-output feature and verify via ``check_rename``.

    The slice is expected to expose exactly one output name, the new name.
    """
    renamed_to = "session_test"
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    first_slice = ft.feature_base.FeatureOutputSlice(product_top_two, 0)
    check_rename(first_slice, renamed_to, ["session_test"])
def test_set_feature_names_not_unique(es):
    """Duplicate output names passed to ``set_feature_names`` must raise ``ValueError``."""
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    duplicated = ["col1", "col1"]
    with pytest.raises(
        ValueError, match="Provided output feature names must be unique."
    ):
        product_top_two.set_feature_names(duplicated)
def test_direct_rename_multioutput(es):
    """Renaming a direct feature over a multi-output base returns a distinct copy.

    The renamed copy must differ in unique name and name, while its base
    feature's generated name and its entity stay the same as the original's.
    """
    customer_top_two = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(customer_top_two, es['sessions'])
    renamed = original.rename("session_test")

    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    original_base_name = original.base_features[0].generate_name()
    renamed_base_name = renamed.base_features[0].generate_name()
    assert original_base_name == renamed_base_name

    assert original.entity == renamed.entity
def test_direct_rename_multioutput(es):
    """Renaming a direct feature over a multi-output base returns a distinct copy.

    The copy's unique name and name must change; the base feature's generated
    name and the dataframe name must not.
    """
    customer_top_two = Feature(
        es['log'].ww['product_id'],
        parent_dataframe_name='customers',
        primitive=NMostCommon(n=2),
    )
    original = DirectFeature(customer_top_two, 'sessions')
    renamed = original.rename("session_test")

    # Identity of the feature changes with the rename...
    assert original.unique_name() != renamed.unique_name()
    assert original.get_name() != renamed.get_name()

    # ...but what it is built from and where it lives do not.
    original_base_name = original.base_features[0].generate_name()
    renamed_base_name = renamed.base_features[0].generate_name()
    assert original_base_name == renamed_base_name
    assert original.dataframe_name == renamed.dataframe_name
def test_set_feature_names_wrong_number_of_names(es):
    """Passing one name to a two-output feature must raise ``ValueError``.

    The message is escaped with ``re.escape`` because it contains characters
    that are special in the regular expression used by ``match``.
    """
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    too_few_names = ["col1"]
    expected_pattern = re.escape(
        "Number of names provided must match the number of output features: 1 name(s) provided, 2 expected."
    )
    with pytest.raises(ValueError, match=expected_pattern):
        product_top_two.set_feature_names(too_few_names)
def test_renaming_resets_feature_output_names_to_default(es):
    """Renaming a feature discards custom output names in favour of indexed ones.

    Custom names are set and confirmed first; after ``rename`` the output
    names must be the new name suffixed with ``[0]`` and ``[1]``.
    """
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    custom_names = ["renamed1", "renamed2"]
    product_top_two.set_feature_names(custom_names)
    assert product_top_two.get_feature_names() == custom_names

    renamed = product_top_two.rename("new_feature_name")
    expected_defaults = ["new_feature_name[0]", "new_feature_name[1]"]
    assert renamed.get_feature_names() == expected_defaults
def test_direct_rename_multioutput(es):
    """Rename a direct feature built on a two-output ``NMostCommon`` feature.

    Asserts that the renamed copy has a different unique name and name, the
    same generated base-feature name, and the same dataframe name.
    """
    base = Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    direct = DirectFeature(base, "sessions")
    direct_copy = direct.rename("session_test")

    assert direct.unique_name() != direct_copy.unique_name()
    assert direct.get_name() != direct_copy.get_name()

    base_before = direct.base_features[0]
    base_after = direct_copy.base_features[0]
    assert base_before.generate_name() == base_after.generate_name()

    assert direct.dataframe_name == direct_copy.dataframe_name
def test_nmostcommon_categorical():
    """``NMostCommon(3)`` must give the same result for int and categorical input.

    The values ``[1, 2, 1, 1]`` are cast to ``int64``, to ``category``, and to
    a categorical dtype with an extra unused category; in every case the
    result must equal ``[1.0, 2.0, NaN]``.
    """
    n_most = NMostCommon(3)
    expected = pd.Series([1.0, 2.0, np.nan])

    dtypes = [
        "int64",
        "category",
        # Value counts includes data for categories that are not present in
        # the data. Category 3 is unused here, so its count must not show up
        # in the most common outputs.
        CategoricalDtype(categories=[1, 2, 3]),
    ]
    for dtype in dtypes:
        values = pd.Series([1, 2, 1, 1]).astype(dtype)
        assert pd.Series(n_most(values)).equals(expected)
def test_seed_multi_output_feature_stacking(es):
    """A multi-output seed feature must be stacked on slice by slice by ``dfs``.

    With the session-level ``NMostCommon(3)`` feature as a seed and
    ``NumUnique`` as the only aggregation primitive, the returned feature list
    must contain ``NUM_UNIQUE`` over each of the three slices.
    """
    seed = ft.Feature(
        es['log']['product_id'],
        parent_entity=es["sessions"],
        primitive=NMostCommon(3),
    )
    _, definitions = ft.dfs(
        entityset=es,
        target_entity="customers",
        seed_features=[seed],
        agg_primitives=[NumUnique],
        trans_primitives=[],
        max_depth=4,
    )

    expected_names = [
        'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])' % slice_index
        for slice_index in range(3)
    ]
    for expected_name in expected_names:
        assert feature_with_name(definitions, expected_name)
def test_to_dictionary_multi_slice(es):
    """Check the dictionary form of slice 0 of a two-output feature.

    The slice must serialize as a ``FeatureOutputSlice`` that depends on, and
    points at, its parent feature, with ``name`` set to ``None`` and ``n`` 0.
    """
    parent = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=2),
    )
    first_slice = parent[0]

    parent_key = 'customers: N_MOST_COMMON(log.product_id, n=2)'
    expected = {
        'type': 'FeatureOutputSlice',
        'dependencies': [parent_key],
        'arguments': {
            'name': None,
            'base_feature': parent_key,
            'n': 0,
        },
    }

    assert expected == first_slice.to_dictionary()
def test_custom_feature_names_retained_during_serialization(pd_es, tmpdir):
    """Custom output names must survive ``save_features`` / ``load_features``.

    Custom names are set on a multi-output transform feature, a groupby
    transform feature and a multi-output aggregation feature. A ``Negate``
    feature stacked on slice 1 of the transform feature is saved alongside
    them, and after loading its name must reflect the custom slice name.
    """
    class MultiCumulative(TransformPrimitive):
        name = "multi_cum_sum"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 3

    log_value = pd_es["log"].ww["value"]
    transform_feature = ft.Feature(log_value, primitive=MultiCumulative)
    groupby_feature = ft.GroupByTransformFeature(
        pd_es["log"].ww["value"],
        primitive=MultiCumulative,
        groupby=pd_es["log"].ww["product_id"],
    )
    aggregation_feature = ft.Feature(
        pd_es["log"].ww["product_id"],
        parent_dataframe_name="customers",
        primitive=NMostCommon(n=2),
    )

    # The stacked feature is built before the custom names are assigned.
    second_output = FeatureOutputSlice(transform_feature, 1)
    negated_second_output = ft.Feature(second_output, primitive=Negate)

    trans_names = ["cumulative_sum", "cumulative_max", "cumulative_min"]
    transform_feature.set_feature_names(trans_names)

    groupby_trans_names = ["grouped_sum", "grouped_max", "grouped_min"]
    groupby_feature.set_feature_names(groupby_trans_names)

    agg_names = ["first_most_common", "second_most_common"]
    aggregation_feature.set_feature_names(agg_names)

    to_save = [
        transform_feature,
        aggregation_feature,
        groupby_feature,
        negated_second_output,
    ]
    features_path = os.path.join(tmpdir, "features.json")
    ft.save_features(to_save, features_path)
    loaded = ft.load_features(features_path)

    loaded_trans, loaded_agg, loaded_groupby, loaded_stacked = loaded
    assert loaded_trans.get_feature_names() == trans_names
    assert loaded_agg.get_feature_names() == agg_names
    assert loaded_groupby.get_feature_names() == groupby_trans_names
    assert loaded_stacked.get_feature_names() == ["-(cumulative_max)"]
def test_to_dictionary_multi_slice(es):
    """Check the dictionary form of slice 0 of a two-output feature.

    The expected dictionary names the parent feature as the only dependency
    and as ``base_feature``, and carries the slice's own name and index 0.
    """
    product_top_two = ft.Feature(
        es["log"].ww["product_id"],
        primitive=NMostCommon(n=2),
        parent_dataframe_name="customers",
    )
    first_slice = product_top_two[0]

    expected_arguments = {
        "name": "N_MOST_COMMON(log.product_id, n=2)[0]",
        "base_feature": "customers: N_MOST_COMMON(log.product_id, n=2)",
        "n": 0,
    }
    expected = {
        "type": "FeatureOutputSlice",
        "dependencies": ["customers: N_MOST_COMMON(log.product_id, n=2)"],
        "arguments": expected_arguments,
    }

    assert expected == first_slice.to_dictionary()
def test_multi_output_features(es):
    """Serialize multi-output features and compare with a hand-built dictionary.

    The feature list holds the ``NMostCommon`` feature, its identity base,
    and for each of the first three slices a ``NumUnique`` feature followed by
    the slice itself. In the expected dictionary, features that carry a
    primitive refer to it by its key in ``primitive_definitions``.
    """
    product_id = ft.IdentityFeature(es["log"].ww["product_id"])
    most_common = NMostCommon()
    unique_count = NumUnique()
    session_common = ft.Feature(
        product_id, parent_dataframe_name="sessions", primitive=most_common
    )

    features = [session_common, product_id]
    for slice_index in range(3):
        stacked = ft.Feature(
            session_common[slice_index],
            parent_dataframe_name="customers",
            primitive=unique_count,
        )
        features.extend([stacked, session_common[slice_index]])

    serializer = FeaturesSerializer(features)

    unique_names = [feature.unique_name() for feature in features]
    dictionaries = [feature.to_dictionary() for feature in features]
    definitions = dict(zip(unique_names, dictionaries))

    expected = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": unique_names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(most_common),
            "1": serialize_primitive(unique_count),
        },
    }

    # Position 0 is the NMostCommon feature; positions 2, 4 and 6 are the
    # NumUnique features appended by the loop above.
    definitions[unique_names[0]]["arguments"]["primitive"] = "0"
    for position in (2, 4, 6):
        definitions[unique_names[position]]["arguments"]["primitive"] = "1"

    actual = serializer.to_dict()
    _compare_feature_dicts(expected, actual)
def test_multioutput_feature(es):
    """Deserialize a hand-built dictionary of multi-output features.

    The dictionary describes the ``NMostCommon`` feature, its identity base,
    and for each of the first three slices a ``NumUnique`` feature followed by
    the slice itself. Each deserialized feature must have the same unique name
    as the original at the same position.
    """
    value = ft.IdentityFeature(es["log"].ww["product_id"])
    most_common = NMostCommon()
    unique_count = NumUnique()
    session_common = ft.Feature(
        value, parent_dataframe_name="sessions", primitive=most_common
    )

    features = [session_common, value]
    for slice_index in range(3):
        stacked = ft.Feature(
            session_common[slice_index],
            parent_dataframe_name="customers",
            primitive=unique_count,
        )
        features.extend([stacked, session_common[slice_index]])

    unique_names = [feature.unique_name() for feature in features]
    dictionaries = [feature.to_dictionary() for feature in features]
    definitions = dict(zip(unique_names, dictionaries))

    dictionary = {
        "ft_version": ft.__version__,
        "schema_version": SCHEMA_VERSION,
        "entityset": es.to_dictionary(),
        "feature_list": unique_names,
        "feature_definitions": definitions,
        "primitive_definitions": {
            "0": serialize_primitive(most_common),
            "1": serialize_primitive(unique_count),
        },
    }

    # Position 0 is the NMostCommon feature; positions 2, 4 and 6 are the
    # NumUnique features appended by the loop above.
    definitions[unique_names[0]]["arguments"]["primitive"] = "0"
    for position in (2, 4, 6):
        definitions[unique_names[position]]["arguments"]["primitive"] = "1"

    restored = FeaturesDeserializer(dictionary).to_list()

    for position, original in enumerate(features):
        assert original.unique_name() == restored[position].unique_name()
def test_topn(es, backend):
    """Check ``NMostCommon(n=2)`` of log product ids for customers 0, 1 and 2.

    The feature is calculated through the ``backend`` fixture and compared row
    by row with the expected products. Row 0 is compared as a set because
    'coke zero' and 'toothpaste' occur the same number of times; the other
    rows are compared position by position, with missing values treated as
    equal to each other.
    """
    topn = ft.Feature(es['log']['product_id'],
                      parent_entity=es['customers'],
                      primitive=NMostCommon(n=2))
    pandas_backend = backend([topn])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    true_results = pd.DataFrame(
        [['toothpaste', 'coke zero'],
         ['coke zero', 'Haribo sugar-free gummy bears'],
         ['taco clock', np.nan]])

    # all() is required here: asserting the bare list would always pass,
    # because a non-empty list is truthy whatever booleans it contains.
    assert all(name in df.columns for name in topn.get_feature_names())

    for i in range(df.shape[0]):
        if i == 0:
            # coke zero and toothpaste have the same number of occurrences
            assert set(true_results.loc[i].values) == set(df.loc[i].values)
        else:
            for i1, i2 in zip(true_results.loc[i], df.iloc[i]):
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)