def test_make_transform_restricts_time_arg():
    """Check that ``time`` is only usable as an argument name with ``uses_calc_time``.

    The first ``make_trans_primitive`` call passes ``uses_calc_time=True`` and
    is expected to complete without raising. The second call omits that flag
    and is expected to raise ``ValueError`` carrying the restricted-keyword
    message.
    """
    # Local import keeps this test self-contained with respect to ``re``.
    import re

    make_trans_primitive(
        lambda time: time,
        [Datetime],
        Numeric,
        name="AllowedPrimitive",
        description="This primitive should be accepted",
        uses_calc_time=True,
    )

    error_text = "'time' is a restricted keyword. Please use a different keyword."
    # ``match`` is applied as a regular expression search, so the message is
    # escaped to make the periods match literally rather than any character.
    with pytest.raises(ValueError, match=re.escape(error_text)):
        make_trans_primitive(
            lambda time: time,
            [Datetime],
            Numeric,
            name="BadPrimitive",
            description="This primitive should erorr",
        )
def test_make_transform_sets_kwargs_correctly(es):
    """Two instances of one generated primitive must each keep their own kwargs.

    Builds an ``is_in`` primitive with ``make_trans_primitive``, wraps two
    different base features with differently-configured instances, and checks
    the base feature and the ``list_of_outputs`` kwarg stored on each.
    """

    def pd_is_in(array, list_of_outputs=None):
        # A missing list is treated as an empty list of candidates.
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self, base_feature_names):
        first_base = base_feature_names[0]
        outputs_repr = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (first_base, outputs_repr)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # First feature: product ids checked against two values.
    product_values = ["toothpaste", "coke_zero"]
    product_base = ft.Feature(es['log']['product_id'])
    product_isin = ft.Feature(
        product_base,
        primitive=IsIn(list_of_outputs=product_values),
    )

    # Second feature: session ids checked against a single value.
    session_values = ["coke_zero"]
    session_base = ft.Feature(es['log']['session_id'])
    session_isin = ft.Feature(
        session_base,
        primitive=IsIn(list_of_outputs=session_values),
    )

    assert product_base == product_isin.base_features[0]
    assert product_values == product_isin.primitive.kwargs['list_of_outputs']
    assert session_base == session_isin.base_features[0]
    assert session_values == session_isin.primitive.kwargs['list_of_outputs']
def test_make_transform_sets_kwargs_correctly(es):
    """Two features built from one generated primitive keep separate kwargs.

    This variant calls the generated ``IsIn`` class directly with the base
    feature and reads ``kwargs`` / ``base_features`` from the result.
    """

    def pd_is_in(array, list_of_outputs=None):
        # A missing list is treated as an empty list of candidates.
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs_repr = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs_repr)

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # First feature: product ids checked against two values.
    product_values = ["toothpaste", "coke_zero"]
    product_base = Feature(es['log']['product_id'])
    product_isin = IsIn(product_base, list_of_outputs=product_values)

    # Second feature: session ids checked against a single value.
    session_values = ["coke_zero"]
    session_base = Feature(es['log']['session_id'])
    session_isin = IsIn(session_base, list_of_outputs=session_values)

    assert product_base == product_isin.base_features[0]
    assert product_values == product_isin.kwargs['list_of_outputs']
    assert session_base == session_isin.base_features[0]
    assert session_values == session_isin.kwargs['list_of_outputs']
def test_override_multi_feature_names(es):
    """A ``generate_names`` override must determine the feature matrix columns.

    A three-output primitive is created with a custom ``generate_names``
    attribute, DFS is run on ``customers``, and every name produced by the
    override for ``['age']`` is expected among the resulting columns.
    """

    def gen_custom_names(primitive, base_feature_names):
        templates = ('Above18(%s)', 'Above21(%s)', 'Above65(%s)')
        return [template % base_feature_names for template in templates]

    def is_greater(x):
        # One comparison result per threshold, in threshold order.
        return tuple(x > threshold for threshold in (18, 21, 65))

    output_count = 3
    IsGreater = make_trans_primitive(
        function=is_greater,
        input_types=[Numeric],
        return_type=Numeric,
        number_output_features=output_count,
        cls_attributes={"generate_names": gen_custom_names},
    )

    fm, features = ft.dfs(
        entityset=es,
        target_entity="customers",
        instance_ids=[0, 1, 2],
        agg_primitives=[],
        trans_primitives=[IsGreater],
    )

    for expected in gen_custom_names(IsGreater, ['age']):
        assert expected in fm.columns
def test_make_transform_restricts_time_arg():
    """Check that ``time`` is only usable as an argument name with ``uses_calc_time``.

    The first ``make_trans_primitive`` call passes ``uses_calc_time=True`` and
    is expected to complete without raising. The second call omits that flag
    and is expected to raise ``ValueError`` carrying the restricted-keyword
    message.
    """
    # Local import keeps this test self-contained with respect to ``re``.
    import re

    make_trans_primitive(
        lambda time: time,
        [Datetime],
        Numeric,
        name="AllowedPrimitive",
        description="This primitive should be accepted",
        uses_calc_time=True,
    )

    error_text = "'time' is a restricted keyword. Please use a different keyword."
    # ``match`` is applied as a regular expression search, so the message is
    # escaped to make the periods match literally rather than any character.
    with pytest.raises(ValueError, match=re.escape(error_text)):
        make_trans_primitive(
            lambda time: time,
            [Datetime],
            Numeric,
            name="BadPrimitive",
            description="This primitive should erorr",
        )
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` against known answers.

    Three features are evaluated for instance ids 0-7: the custom primitive on
    ``product_id``, the built-in ``isin`` on ``product_id``, and the built-in
    ``isin`` on ``value``. A dask result is computed before comparison.
    """

    def pd_is_in(array, list_of_outputs=None):
        if list_of_outputs is None:
            list_of_outputs = []
        # dask series are used as-is; anything else is wrapped in a pandas Series.
        series = array if isinstance(array, dd.Series) else pd.Series(array)
        return series.isin(list_of_outputs)

    def isin_generate_name(self, base_feature_names):
        first_base = base_feature_names[0]
        outputs_repr = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (first_base, outputs_repr)

    def column_values(feature):
        # Compute the single-feature matrix and return that feature's column as a list.
        matrix = ft.calculate_feature_matrix(entityset=es,
                                             features=[feature],
                                             instance_ids=range(8))
        if isinstance(matrix, dd.DataFrame):
            matrix = matrix.compute()
        return matrix[feature.get_name()].values.tolist()

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # Custom primitive on product_id.
    custom_isin = ft.Feature(
        es['log']['product_id'],
        primitive=IsIn(list_of_outputs=["toothpaste", "coke zero"]),
    )
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(custom_isin)

    # Built-in isin on product_id.
    builtin_isin = ft.Feature(es['log']['product_id']).isin(
        ["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(builtin_isin)

    # Built-in isin on the numeric value column.
    value_isin = ft.Feature(es['log']['value']).isin([5, 10])
    expected = [False, True, True, False, False, False, False, False]
    assert expected == column_values(value_isin)
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` against known answers.

    Three features are evaluated for instance ids 0-7 and converted with
    ``to_pandas(..., index='id', sort_index=True)`` before comparison: the
    custom primitive on ``product_id``, the built-in ``isin`` on
    ``product_id``, and the built-in ``isin`` on ``value``.
    """

    def pd_is_in(array, list_of_outputs=None):
        # A missing list is treated as an empty list of candidates.
        candidates = [] if list_of_outputs is None else list_of_outputs
        return array.isin(candidates)

    def isin_generate_name(self, base_feature_names):
        first_base = base_feature_names[0]
        outputs_repr = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (first_base, outputs_repr)

    def column_values(feature):
        # Compute the single-feature matrix and return that feature's column as a list.
        matrix = ft.calculate_feature_matrix(entityset=es,
                                             features=[feature],
                                             instance_ids=range(8))
        frame = to_pandas(matrix, index='id', sort_index=True)
        return frame[feature.get_name()].tolist()

    IsIn = make_trans_primitive(
        pd_is_in,
        [ColumnSchema()],
        ColumnSchema(logical_type=Boolean),
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # Custom primitive on product_id.
    custom_isin = ft.Feature(
        es['log'].ww['product_id'],
        primitive=IsIn(list_of_outputs=["toothpaste", "coke zero"]),
    )
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(custom_isin)

    # Built-in isin on product_id.
    builtin_isin = ft.Feature(es['log'].ww['product_id']).isin(
        ["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(builtin_isin)

    # Built-in isin on the numeric value column.
    value_isin = ft.Feature(es['log'].ww['value']).isin([5, 10])
    expected = [False, True, True, False, False, False, False, False]
    assert expected == column_values(value_isin)
def test_make_transform_multiple_output_features(pd_es):
    """A six-output datetime primitive must agree with the single-output ones.

    The generated primitive splits a datetime into year/month/day/hour/
    minute/second. Its columns are compared with those of the individual
    ``Year`` ... ``Second`` primitives, and stacked feature names built on
    its outputs are looked up in the DFS feature list.
    """

    def test_time(x):
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        # One series per unit, each extracted attribute-wise from the timestamps.
        return [times.apply(lambda stamp: getattr(stamp, unit)) for unit in units]

    def gen_feat_names(self):
        base_name = self.base_features[0].get_name()
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return ["Now.%s(%s)" % (subname, base_name) for subname in subnames]

    TestTime = make_trans_primitive(
        function=test_time,
        input_types=[ColumnSchema(logical_type=Datetime)],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    join_time_split = ft.Feature(pd_es["log"].ww["datetime"], primitive=TestTime)

    single_output_primitives = [Year, Month, Day, Hour, Minute, Second]
    alt_features = [
        ft.Feature(pd_es["log"].ww["datetime"], primitive=primitive)
        for primitive in single_output_primitives
    ]

    fm, fl = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="log",
        agg_primitives=['sum'],
        trans_primitives=[TestTime] + single_output_primitives + [Diff],
        max_depth=5,
    )

    subnames = join_time_split.get_feature_names()
    altnames = [feature.get_name() for feature in alt_features]
    for multi_col, single_col in zip(subnames, altnames):
        assert (fm[multi_col] == fm[single_col]).all()

    for idx in range(6):
        stacked_sum = 'sessions.customers.SUM(log.TEST_TIME(datetime)[%d])' % idx
        assert feature_with_name(fl, stacked_sum)
        # NOTE(review): this tests a plain string for membership in the feature
        # list; what it verifies depends on how features compare to strings —
        # confirm it checks what is intended.
        assert ('products.DIFF(SUM(log.TEST_TIME(datetime)[%d]))' % idx) in fl
def test_groupby_with_multioutput_primitive(pd_es):
    """A multi-output groupby primitive must match its single-output counterparts.

    ``MultiCumSum`` returns cumsum, cummax and cummin together. Each of its
    outputs is compared with the matching ``CumSum`` / ``CumMax`` / ``CumMin``
    column, and the single-output columns are compared with a second DFS run
    that does not include the multi-output primitive.
    """

    def multi_cum_sum(x):
        return x.cumsum(), x.cummax(), x.cummin()

    def assert_pairwise_equal(left, right):
        # Element-by-element comparison, stopping at the first mismatch.
        for x, y in zip(left, right):
            assert x == y

    num_features = 3
    MultiCumSum = make_trans_primitive(
        function=multi_cum_sum,
        input_types=[ColumnSchema(semantic_tags={'numeric'})],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        number_output_features=num_features,
    )

    fm, _ = dfs(
        entityset=pd_es,
        target_dataframe_name='customers',
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[MultiCumSum, CumSum, CumMax, CumMin],
    )

    # Calculate output in a separate DFS call to make sure the multi-output code
    # does not alter any values
    fm2, _ = dfs(
        entityset=pd_es,
        target_dataframe_name='customers',
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[CumSum, CumMax, CumMin],
    )

    answer_cols = [
        ['CUM_SUM(age) by cohort', 'CUM_SUM(age) by région_id'],
        ['CUM_MAX(age) by cohort', 'CUM_MAX(age) by région_id'],
        ['CUM_MIN(age) by cohort', 'CUM_MIN(age) by région_id'],
    ]

    for idx, (cohort_answer, region_answer) in enumerate(answer_cols):
        # Check that multi-output gives correct answers
        multi_by_cohort = 'MULTI_CUM_SUM(age)[%d] by cohort' % idx
        assert multi_by_cohort in fm.columns
        assert_pairwise_equal(fm[multi_by_cohort].values, fm[cohort_answer].values)

        multi_by_region = 'MULTI_CUM_SUM(age)[%d] by région_id' % idx
        assert multi_by_region in fm.columns
        assert_pairwise_equal(fm[multi_by_region].values, fm[region_answer].values)

        # Verify single output results are unchanged by inclusion of
        # multi-output primitive
        assert_pairwise_equal(fm[cohort_answer], fm2[cohort_answer])
        assert_pairwise_equal(fm[region_answer], fm2[region_answer])
def test_make_transform_multiple_output_features(es):
    """A six-output datetime primitive must agree with the single-output ones.

    The generated primitive splits a datetime into year/month/day/hour/
    minute/second. Its columns are compared with those of the individual
    ``Year`` ... ``Second`` primitives, and the DFS feature list is scanned
    to confirm no feature uses the multi-output feature as a base.
    """

    def test_f(x):
        times = pd.Series(x)
        units = ["year", "month", "day", "hour", "minute", "second"]
        # One series per unit, each extracted attribute-wise from the timestamps.
        return [times.apply(lambda stamp: getattr(stamp, unit)) for unit in units]

    def gen_feat_names(self):
        base_name = self.base_features[0].get_name()
        subnames = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return ["Now.%s(%s)" % (subname, base_name) for subname in subnames]

    TestTime = make_trans_primitive(
        function=test_f,
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    join_time_split = ft.Feature(es["log"]["datetime"], primitive=TestTime)

    single_output_primitives = [Year, Month, Day, Hour, Minute, Second]
    alt_features = [
        ft.Feature(es["log"]["datetime"], primitive=primitive)
        for primitive in single_output_primitives
    ]

    fm, fl = ft.dfs(
        entityset=es,
        target_entity="log",
        trans_primitives=[TestTime] + single_output_primitives,
    )

    subnames = join_time_split.get_feature_names()
    altnames = [feature.get_name() for feature in alt_features]
    for multi_col, single_col in zip(subnames, altnames):
        assert (fm[multi_col] == fm[single_col]).all()

    # check no feature stacked on new primitive
    for feature in fl:
        for base_feature in feature.base_features:
            assert base_feature.hash() != join_time_split.hash()
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` against known answers.

    Each feature is evaluated through ``PandasBackend`` for instance ids 0-7:
    the custom primitive on ``product_id``, the built-in ``isin`` on
    ``product_id``, and the built-in ``isin`` on ``value``.
    """

    def pd_is_in(array, list_of_outputs=None):
        # A missing list is treated as an empty list of candidates.
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs_repr = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs_repr)

    def column_values(feature):
        # Evaluate just this feature and return its column as a plain list.
        backend = PandasBackend(es, [feature])
        frame = backend.calculate_all_features(range(8), None)
        return frame[feature.get_name()].values.tolist()

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # Custom primitive on product_id.
    custom_isin = IsIn(es['log']['product_id'],
                       list_of_outputs=["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(custom_isin)

    # Built-in isin on product_id.
    builtin_isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(builtin_isin)

    # Built-in isin on the numeric value column.
    value_isin = Feature(es['log']['value']).isin([5, 10])
    expected = [False, True, True, False, False, False, False, False]
    assert expected == column_values(value_isin)
def test_isin_feat_custom(es):
    """Compare a custom ``is_in`` primitive and ``Feature.isin`` against known answers.

    Each feature is evaluated through ``PandasBackend`` for instance ids 0-7:
    the custom primitive on ``product_id``, the built-in ``isin`` on
    ``product_id``, and the built-in ``isin`` on ``value``.
    """

    def pd_is_in(array, list_of_outputs=None):
        # A missing list is treated as an empty list of candidates.
        candidates = [] if list_of_outputs is None else list_of_outputs
        return pd.Series(array).isin(candidates)

    def isin_generate_name(self):
        base_name = self.base_features[0].get_name()
        outputs_repr = str(self.kwargs['list_of_outputs'])
        return u"%s.isin(%s)" % (base_name, outputs_repr)

    def column_values(feature):
        # Evaluate just this feature and return its column as a plain list.
        backend = PandasBackend(es, [feature])
        frame = backend.calculate_all_features(range(8), None)
        return frame[feature.get_name()].values.tolist()

    IsIn = make_trans_primitive(
        pd_is_in,
        [Variable],
        Boolean,
        name="is_in",
        description="For each value of the base feature, checks whether it is "
                    "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name},
    )

    # Custom primitive on product_id.
    custom_isin = IsIn(es['log']['product_id'],
                       list_of_outputs=["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(custom_isin)

    # Built-in isin on product_id.
    builtin_isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"])
    expected = [True, True, True, False, False, True, True, True]
    assert expected == column_values(builtin_isin)

    # Built-in isin on the numeric value column.
    value_isin = Feature(es['log']['value']).isin([5, 10])
    expected = [False, True, True, False, False, False, False, False]
    assert expected == column_values(value_isin)
def test_groupby_with_multioutput_primitive_custom_names(pd_es):
    """Custom names on a multi-output groupby primitive must appear as columns.

    ``MultiCumSum`` is given a ``generate_names`` override. Each custom-named
    column (per groupby key) must exist in the feature matrix and hold the
    same values as the matching ``CumSum`` / ``CumMax`` / ``CumMin`` column.
    """

    def gen_custom_names(primitive, base_feature_names):
        return ["CUSTOM_SUM", "CUSTOM_MAX", "CUSTOM_MIN"]

    def multi_cum_sum(x):
        return x.cumsum(), x.cummax(), x.cummin()

    num_features = 3
    MultiCumSum = make_trans_primitive(
        function=multi_cum_sum,
        input_types=[ColumnSchema(semantic_tags={'numeric'})],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        number_output_features=num_features,
        cls_attributes={"generate_names": gen_custom_names},
    )

    fm, _ = dfs(
        entityset=pd_es,
        target_dataframe_name='customers',
        trans_primitives=[],
        agg_primitives=[],
        groupby_trans_primitives=[MultiCumSum, CumSum, CumMax, CumMin],
    )

    answer_cols = [
        ['CUM_SUM(age) by cohort', 'CUM_SUM(age) by région_id'],
        ['CUM_MAX(age) by cohort', 'CUM_MAX(age) by région_id'],
        ['CUM_MIN(age) by cohort', 'CUM_MIN(age) by région_id'],
    ]
    expected_names = [
        ['CUSTOM_SUM by cohort', 'CUSTOM_SUM by région_id'],
        ['CUSTOM_MAX by cohort', 'CUSTOM_MAX by région_id'],
        ['CUSTOM_MIN by cohort', 'CUSTOM_MIN by région_id'],
    ]

    # Walk the custom and reference names in lockstep: output first, then groupby key.
    for custom_pair, answer_pair in zip(expected_names, answer_cols):
        for custom_name, answer_name in zip(custom_pair, answer_pair):
            assert custom_name in fm.columns
            for x, y in zip(fm[custom_name].values, fm[answer_name].values):
                assert x == y
def test_groupby_multi_output_stacking(pd_es):
    """``sum`` aggregations are expected on each output of a groupby multi-output primitive.

    Runs DFS in ``features_only`` mode on ``sessions`` with a six-output
    identity primitive as the groupby transform, then looks up the expected
    stacked feature names for every output index.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )

    feature_list = dfs(
        entityset=pd_es,
        target_entity="sessions",
        agg_primitives=['sum'],
        groupby_trans_primitives=[TestTime],
        features_only=True,
        max_depth=4,
    )

    for idx in range(6):
        by_product = 'SUM(log.TEST_TIME(datetime)[%d] by product_id)' % idx
        assert feature_with_name(feature_list, by_product)
        # NOTE(review): this tests a plain string for membership in the feature
        # list; what it verifies depends on how features compare to strings —
        # confirm it checks what is intended.
        assert ('customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)' % idx) in feature_list
def test_groupby_multi_output_stacking(es):
    """``CumSum`` groupby features are expected on each output of a multi-output primitive.

    Runs DFS in ``features_only`` mode on ``sessions`` with a six-output
    identity transform primitive and ``CumSum`` as the groupby transform,
    then looks up the expected stacked feature names for every output index.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )

    feature_list = dfs(
        entityset=es,
        target_entity="sessions",
        agg_primitives=[],
        trans_primitives=[TestTime],
        groupby_trans_primitives=[CumSum],
        features_only=True,
        max_depth=4,
    )

    for idx in range(6):
        by_cohort = 'customers.CUM_SUM(TEST_TIME(upgrade_date)[%d]) by cohort' % idx
        assert feature_with_name(feature_list, by_cohort)
        # NOTE(review): this tests a plain string for membership in the feature
        # list; what it verifies depends on how features compare to strings —
        # confirm it checks what is intended.
        assert ('customers.CUM_SUM(TEST_TIME(date_of_birth)[%d]) by customer_id' % idx) in feature_list