def test_init_and_name(es):
    """Instantiate every transform primitive on matching input features and
    exercise get_name() plus a small calculation.

    NOTE(review): later definitions of test_init_and_name in this module
    shadow this one, so pytest never collects this version.
    """
    log = es['log']
    # One identity feature per log variable, plus a direct-feature comparison
    # so comparison-typed inputs have something to match against.
    features = [Feature(v) for v in log.variables] + \
        [GreaterThan(Feature(es["products"]["rating"], es["log"]), 2.5)]
    # Add Timedelta feature
    features.append(pd.Timestamp.now() - Feature(log['datetime']))
    # NOTE(review): iterated directly here but via .values() in the later
    # copy — confirm get_transform_primitives() yields classes, not names,
    # in this version.
    for transform_prim in get_transform_primitives():
        # Compare primitives are skipped here (exercised elsewhere).
        if issubclass(transform_prim, Compare):
            continue
        # use the input_types matching function from DFS
        input_types = transform_prim.input_types
        if type(input_types[0]) == list:
            # Primitive declares several allowed signatures; gather the
            # matches for every signature.
            matching_inputs = [
                g
                for s in input_types
                for g in match(s, features)
            ]
        else:
            matching_inputs = match(input_types, features)
        if len(matching_inputs) == 0:
            raise Exception("Transform Primitive %s not tested"
                            % transform_prim.name)
        for s in matching_inputs:
            instance = transform_prim(*s)
            # try to get name and calculate
            instance.get_name()
            instance.head()
def test_make_transform_sets_kwargs_correctly(es):
    """A primitive built with make_trans_primitive should record its keyword
    arguments on every feature instance it creates."""
    def pd_is_in(array, list_of_outputs=None):
        # Membership test against the provided values (empty by default).
        if list_of_outputs is None:
            list_of_outputs = []
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        return u"%s.isin(%s)" % (self.base_features[0].get_name(),
                                 str(self.kwargs['list_of_outputs']))

    IsIn = make_trans_primitive(
        pd_is_in, [Variable], Boolean, name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    first_values = ["toothpaste", "coke_zero"]
    first_base = Feature(es['log']['product_id'])
    first_feat = IsIn(first_base, list_of_outputs=first_values)

    second_values = ["coke_zero"]
    second_base = Feature(es['log']['session_id'])
    second_feat = IsIn(second_base, list_of_outputs=second_values)

    # Each instance must keep its own base feature and kwargs.
    for base, values, feat in ((first_base, first_values, first_feat),
                               (second_base, second_values, second_feat)):
        assert base == feat.base_features[0]
        assert values == feat.kwargs['list_of_outputs']
def test_max_hlevel(es):
    """max_hlevel should cap how high aggregation features may be stacked
    before being pulled back down as direct features."""
    shared_kwargs = dict(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[Count, Last],
        trans_primitives=[Hour],
        max_depth=-1,
    )
    # Build the generated feature-name lists for hlevels -1 (unbounded),
    # 0 and 1.
    names_by_hlevel = {}
    for hlevel in (-1, 0, 1):
        dfs_obj = DeepFeatureSynthesis(max_hlevel=hlevel, **shared_kwargs)
        names_by_hlevel[hlevel] = [f.get_name()
                                   for f in dfs_obj.build_features()]

    customer_last = Last(es['log']['value'], es['customers'])
    session_last = Last(es['log']['value'], es['sessions'])
    via_customer = Feature(customer_last, es['log'])
    via_session = Feature(session_last, es['log'])

    # Unlimited hlevel keeps both stacked features.
    assert via_customer.get_name() in names_by_hlevel[-1]
    assert via_session.get_name() in names_by_hlevel[-1]
    # hlevel 1 keeps only the session-level (one-hop) feature.
    assert via_customer.get_name() not in names_by_hlevel[1]
    assert via_session.get_name() in names_by_hlevel[1]
    # hlevel 0 keeps neither.
    assert via_customer.get_name() not in names_by_hlevel[0]
    assert via_session.get_name() not in names_by_hlevel[0]
def test_isin_feat_other_syntax(es):
    """Feature.isin via method-call syntax should work on string values."""
    isin_feat = Feature(es['log']['product_id']).isin(["toothpaste",
                                                       "coke zero"])
    backend = PandasBackend(es, [isin_feat])
    df = backend.calculate_all_features(range(8), None)
    expected = [True, True, True, False, False, True, True, True]
    assert expected == df[isin_feat.get_name()].values.tolist()
def test_isin_feat_other_syntax_int(es):
    """Feature.isin via method-call syntax should work on integer values."""
    isin_feat = Feature(es['log']['value']).isin([5, 10])
    backend = PandasBackend(es, [isin_feat])
    df = backend.calculate_all_features(range(8), None)
    expected = [False, True, True, False, False, False, False, False]
    assert expected == df[isin_feat.get_name()].values.tolist()
def test_percentile(es):
    """Percentile should agree with pandas rank(pct=True) on raw values.

    NOTE(review): another test_percentile is defined later in this module
    and shadows this one, so pytest never collects this version.
    """
    value = Feature(es['log']['value'])
    pct = Percentile(value)
    backend = PandasBackend(es, [value, pct])
    df = backend.calculate_all_features(range(17), None)
    expected = df[value.get_name()].rank(pct=True)
    for want, got in zip(expected.values, df[pct.get_name()].values):
        # NaNs must line up; otherwise values must match exactly.
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_percentile(es):
    """Percentile must rank against the full entity, not just the requested
    instance window."""
    value = Feature(es['log']['value'])
    pct = Percentile(value)
    backend = PandasBackend(es, [pct])
    window = range(10, 17)
    df = backend.calculate_all_features(window, None)
    # Rank over ALL rows of the entity, then select only the window.
    expected = es['log'].df[value.get_name()].rank(pct=True).loc[window]
    for want, got in zip(expected.values, df[pct.get_name()].values):
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_dependent_percentile(es):
    """A Percentile stacked on another Percentile should still compute,
    without disturbing the base percentile values."""
    v = Feature(es['log']['value'])
    p = Percentile(v)
    p2 = Percentile(p - 1)
    pandas_backend = PandasBackend(es, [p, p2])
    df = pandas_backend.calculate_all_features(range(10, 17), None)
    # Fix: the dependent feature p2 was calculated but never checked at all;
    # at minimum assert its column made it into the output frame.
    assert p2.get_name() in df.columns
    # The base percentile must match pandas ranking over the full entity.
    true = es['log'].df[v.get_name()].rank(pct=True)
    true = true.loc[range(10, 17)]
    for t, a in zip(true.values, df[p.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_return_type_inference_id(es):
    """Id variable types survive direct features but are demoted to
    Categorical by aggregations (and by direct features of aggregations)."""
    # direct features should keep Id variable type
    direct_feat = Feature(es["sessions"]["customer_id"], es["log"])
    assert direct_feat.variable_type == Id

    # aggregations of Id variable types should get converted
    mode_feat = Mode(es["log"]["session_id"], es["customers"])
    assert mode_feat.variable_type == Categorical

    # also test direct feature of aggregation
    stacked = Feature(mode_feat, es["sessions"])
    assert stacked.variable_type == Categorical
def test_agg_percentile(es):
    """Summing a Percentile per session must equal a manual groupby sum of
    pandas percentage ranks."""
    value = Feature(es['log']['value'])
    pct = Percentile(value)
    session_sum = Sum(pct, es['sessions'])
    backend = PandasBackend(es, [session_sum])
    df = backend.calculate_all_features([0, 1], None)

    # Recompute by hand: rank all log values, then sum per session.
    raw = es['log'].df[[value.get_name(), 'session_id']]
    raw['percentile'] = raw[value.get_name()].rank(pct=True)
    expected = raw.groupby('session_id')['percentile'].sum()[[0, 1]]
    for want, got in zip(expected.values, df[session_sum.get_name()].values):
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_direct_percentile(es):
    """A direct feature of a Percentile must carry the parent's rank down
    to each child instance."""
    age = Feature(es['customers']['age'])
    pct = Percentile(age)
    direct = Feature(pct, es['sessions'])
    backend = PandasBackend(es, [direct])
    df = backend.calculate_all_features([0, 1], None)

    raw = es['customers'].df[[age.get_name()]]
    raw['percentile'] = raw[age.get_name()].rank(pct=True)
    # Sessions 0 and 1 both belong to customer 0 in the fixture.
    expected = raw['percentile'].loc[[0, 0]]
    for want, got in zip(expected.values, df[direct.get_name()].values):
        assert (pd.isnull(want) and pd.isnull(got)) or want == got
def test_get_depth(es):
    """A direct feature of a where-filtered count over stacked aggregations
    should report depth 5.

    NOTE(review): two more functions named test_get_depth appear later in
    this module; under pytest the last definition shadows the earlier ones.
    """
    session_counts = Count(es['log']['id'], parent_entity=es['sessions'])
    total_logs = Sum(session_counts, parent_entity=es['customers'])
    busy_customers = total_logs > 5
    region_count = Count(es['customers']['id'],
                         parent_entity=es[u'régions'],
                         where=busy_customers)
    per_customer = Feature(region_count, es["customers"])
    assert per_customer.get_depth() == 5
def test_direct_agg_percentile(es):
    """A direct feature of an aggregated Percentile should match a manual
    groupby of pandas percentage ranks."""
    value = Feature(es['log']['value'])
    pct = Percentile(value)
    customer_sum = Sum(pct, es['customers'])
    direct = Feature(customer_sum, es['sessions'])
    backend = PandasBackend(es, [direct])
    df = backend.calculate_all_features([0, 1], None)

    raw = es['log'].df[[value.get_name(), 'session_id']]
    raw['percentile'] = raw[value.get_name()].rank(pct=True)
    # Hard-coded log-row -> customer mapping matching the test fixture.
    raw['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    expected = raw.groupby('customer_id')['percentile'].sum().fillna(0)
    # Sessions 0 and 1 both belong to customer 0.
    expected = expected[[0, 0]]
    for want, got in zip(expected.values, df[direct.get_name()].values):
        assert (pd.isnull(want) and pd.isnull(got)) or \
            round(want, 3) == round(got, 3)
def test_get_depth(es):
    """Depth-5 check for a where-filtered region count; differs from the
    earlier same-named test only in the ASCII 'regions' entity key.

    NOTE(review): this is one of three test_get_depth definitions in the
    module — only the last one is collected by pytest.
    """
    session_counts = Count(es['log']['id'], parent_entity=es['sessions'])
    total_logs = Sum(session_counts, parent_entity=es['customers'])
    busy_customers = total_logs > 5
    region_count = Count(es['customers']['id'],
                         parent_entity=es['regions'],
                         where=busy_customers)
    per_customer = Feature(region_count, es["customers"])
    assert per_customer.get_depth() == 5
def test_percentile_with_cutoff(es):
    """With a cutoff time, the requested instance ranks highest (1.0)
    among the rows visible at that time."""
    pct = Percentile(Feature(es['log']['value']))
    backend = PandasBackend(es, [pct])
    df = backend.calculate_all_features(
        [2], pd.Timestamp('2011/04/09 10:30:13'))
    assert df[pct.get_name()].tolist()[0] == 1.0
def test_custom_primitive_default_kwargs(es):
    """make_agg_primitive should capture both default and explicitly
    supplied kwargs on each feature instance."""
    def sum_n_times(numeric, n=1):
        # Fix: np.float was deprecated in NumPy 1.20 and removed in 1.24;
        # it was always just an alias for the builtin float, which is the
        # documented replacement.
        return np.nan_to_num(numeric).sum(dtype=float) * n

    SumNTimes = make_agg_primitive(function=sum_n_times,
                                   input_types=[Numeric],
                                   return_type=Numeric)

    sum_n_1_n = 1
    sum_n_1_base_f = Feature(es['log']['value'])
    sum_n_1 = SumNTimes([sum_n_1_base_f], es['sessions'], n=sum_n_1_n)
    sum_n_2_n = 2
    sum_n_2_base_f = Feature(es['log']['value_2'])
    sum_n_2 = SumNTimes([sum_n_2_base_f], es['sessions'], n=sum_n_2_n)

    # Each instance keeps its own base feature and its own n kwarg.
    assert sum_n_1_base_f == sum_n_1.base_features[0]
    assert sum_n_1_n == sum_n_1.kwargs['n']
    assert sum_n_2_base_f == sum_n_2.base_features[0]
    assert sum_n_2_n == sum_n_2.kwargs['n']
def test_get_depth(es):
    """get_depth should count stacked layers and honor stop_at, even when
    the stop list contains duplicate features.

    NOTE(review): this is the last of three test_get_depth definitions in
    the module, so it is the only one pytest collects. The es fixture
    argument is immediately shadowed below — presumably intentional so the
    test runs on a fresh entityset; confirm against the fixture's scope.
    """
    es = make_ecommerce_entityset()
    f = Feature(es['log']['value'])
    g = Feature(es['log']['value'])
    agg1 = Last(f, es['sessions'])
    agg2 = Last(agg1, es['customers'])
    d1 = Feature(agg2, es['sessions'])
    d2 = Feature(d1, es['log'])
    assert d2.get_depth() == 4
    # Make sure this works if we pass in two of the same
    # feature. This came up when user supplied duplicates
    # in the seed_features of DFS.
    assert d2.get_depth(stop_at=[f, g]) == 4
    # Fix: this assertion appeared twice back-to-back in the original;
    # the accidental duplicate was removed.
    assert d2.get_depth(stop_at=[f, g, agg1]) == 3
    assert d2.get_depth(stop_at=[f, g, agg2]) == 2
    assert d2.get_depth(stop_at=[f, g, d1]) == 1
    assert d2.get_depth(stop_at=[f, g, d2]) == 0
def test_two_kinds_of_dependents(es):
    """Regression test: several dependents of where-aggregations spread
    across two output entities must all be calculated."""
    value = Feature(es['log']['value'])
    product = Feature(es['log']['product_id'])
    customer_sum = Sum(value, es['customers'], where=product == 'coke zero')
    pct = Percentile(customer_sum)
    absolute = Absolute(customer_sum)
    session_sum = Sum(value, es['sessions'], where=product == 'coke zero')
    # agg3 reproduces a historical bug around line 218 of pandas_backend:
    # columns already present in the output entity_frames were removed from
    # result_frame before pd.concat, but the removal mutated result_frame
    # itself instead of a copy (_result_frame), so with len(output_frames)
    # > 1 the second iteration was missing columns.
    agg3 = Sum(session_sum, es['customers'])
    backend = PandasBackend(es, [pct, absolute, agg3])
    df = backend.calculate_all_features([0, 1], None)
    assert df[pct.get_name()].tolist() == [0.5, 1.0]
    assert df[absolute.get_name()].tolist() == [15, 26]
def test_get_dependencies(es):
    """get_dependencies: shallow returns only direct dependencies, deep
    walks the full chain, and ignored hashes prune the traversal."""
    base = Feature(es['log']['value'])
    session_sum = Sum(base, es['sessions'])
    customer_sum = Sum(session_sum, es['customers'])
    direct = Feature(customer_sum, es['sessions'])

    shallow = direct.get_dependencies(deep=False, ignored=None)
    deep = direct.get_dependencies(deep=True, ignored=None)
    deep_ignored = direct.get_dependencies(
        deep=True, ignored=set([session_sum.hash()]))

    assert [feat.hash() for feat in shallow] == [customer_sum.hash()]
    assert [feat.hash() for feat in deep] == [customer_sum.hash(),
                                              session_sum.hash(),
                                              base.hash()]
    # Ignoring the middle aggregation stops the walk below it.
    assert [feat.hash() for feat in deep_ignored] == [customer_sum.hash()]
def test_isin_feat_custom(es):
    """A hand-built IsIn primitive and the Feature.isin shorthand must
    produce identical results on both string and integer values."""
    def pd_is_in(array, list_of_outputs=None):
        if list_of_outputs is None:
            list_of_outputs = []
        return pd.Series(array).isin(list_of_outputs)

    def isin_generate_name(self):
        return u"%s.isin(%s)" % (self.base_features[0].get_name(),
                                 str(self.kwargs['list_of_outputs']))

    IsIn = make_trans_primitive(
        pd_is_in, [Variable], Boolean, name="is_in",
        description="For each value of the base feature, checks whether it is "
        "in a list that is provided.",
        cls_attributes={"generate_name": isin_generate_name})

    def check(feat, expected):
        # Calculate the single feature over the first 8 log rows.
        backend = PandasBackend(es, [feat])
        df = backend.calculate_all_features(range(8), None)
        assert expected == df[feat.get_name()].values.tolist()

    check(IsIn(es['log']['product_id'],
               list_of_outputs=["toothpaste", "coke zero"]),
          [True, True, True, False, False, True, True, True])
    check(Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"]),
          [True, True, True, False, False, True, True, True])
    check(Feature(es['log']['value']).isin([5, 10]),
          [False, True, True, False, False, False, False, False])
def test_init_and_name(es):
    """Every registered transform primitive should be instantiable on
    matching features, nameable, and calculable.

    NOTE(review): one of three same-named test functions in this module;
    only the last definition is collected by pytest.
    """
    from featuretools import calculate_feature_matrix
    log = es['log']
    features = [Feature(v) for v in log.variables]
    features += [GreaterThan(Feature(es["products"]["rating"],
                                     es["log"]), 2.5)]
    # Add Timedelta feature
    features.append(pd.Timestamp.now() - Feature(log['datetime']))
    for prim in get_transform_primitives().values():
        # use the input_types matching function from DFS
        input_types = prim.input_types
        if type(input_types[0]) == list:
            candidates = match(input_types[0], features)
        else:
            candidates = match(input_types, features)
        if len(candidates) == 0:
            raise Exception("Transform Primitive %s not tested" % prim.name)
        for combo in candidates:
            instance = prim(*combo)
            # try to get name and calculate
            instance.get_name()
            calculate_feature_matrix([instance], entityset=es).head(5)
def test_cum_sum_where(es):
    """CumSum with a where clause: rows failing the condition keep the
    running total rather than adding to it."""
    value = es['log']['value']
    over_three = GreaterThan(value, 3)
    group = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(value, group, where=over_three)
    backend = PandasBackend(es, [cum_sum])
    df = backend.calculate_all_features(instance_ids=range(15),
                                        time_last=None)
    actual = df[cum_sum.get_name()].values
    assert len(actual) == 15
    expected = [0, 5, 15, 30, 50, 50, 50, 50, 50, 50,
                0, 5, 5, 12, 26]
    for want, got in zip(expected, actual):
        if np.isnan(want):
            assert np.isnan(got)
        else:
            assert want == got
def test_cum_sum_use_previous_and_where_absolute(es):
    """CumSum combining an absolute (40-second) use_previous window with a
    where clause."""
    value = es['log']['value']
    over_three = GreaterThan(value, 3)
    group = Feature(es['sessions']['customer_id'], es['log'])
    cum_sum = CumSum(value, group, es["log"]["datetime"],
                     where=over_three,
                     use_previous=Timedelta(40, 'seconds'))
    backend = PandasBackend(es, [cum_sum])
    df = backend.calculate_all_features(instance_ids=range(15),
                                        time_last=None)
    expected = [0, 5, 15, 30, 50, 0, 0, 0, 0, 0,
                0, 5, 0, 7, 21]
    actual = df[cum_sum.get_name()].values
    assert len(actual) == 15
    for want, got in zip(expected, actual):
        assert want == got
def test_cum_mean_use_previous_and_where(es):
    """CumMean over a 2-observation use_previous window combined with a
    where clause."""
    value = es['log']['value']
    over_three = GreaterThan(value, 3)
    # todo should this be cummean?
    group = Feature(es['sessions']['customer_id'], es['log'])
    cum_mean = CumMean(value, group,
                       where=over_three,
                       use_previous=Timedelta(2, 'observations',
                                              entity=es['log']))
    backend = PandasBackend(es, [cum_mean])
    df = backend.calculate_all_features(instance_ids=range(15),
                                        time_last=None)
    expected = [0, 5, 7.5, 12.5, 17.5, 17.5, 17.5, 17.5, 17.5, 17.5,
                0, 5, 5, 6, 10.5]
    actual = df[cum_mean.get_name()].values
    assert len(actual) == 15
    for want, got in zip(expected, actual):
        assert want == got
def test_init_and_name(es):
    """Every registered aggregation primitive should be instantiable on
    matching features, nameable, and calculable against sessions.

    NOTE(review): earlier functions with this same name in the module are
    shadowed by this definition, so only this one runs under pytest.
    """
    session = es['sessions']
    features = [Feature(v) for v in es['log'].variables]
    for agg_prim in get_aggregation_primitives().values():
        input_types = agg_prim.input_types
        # Normalize to a list of allowed signatures.
        if type(input_types[0]) != list:
            input_types = [input_types]
        # test each allowed input_types for this primitive
        for signature in input_types:
            # use the input_types matching function from DFS
            candidates = match(signature, features)
            if len(candidates) == 0:
                raise Exception("Agg Primitive %s not tested" % agg_prim.name)
            for combo in candidates:
                instance = agg_prim(combo, parent_entity=session)
                # try to get name and calculate
                instance.get_name()
                ft.calculate_feature_matrix([instance], entityset=es).head(5)
def test_return_type_inference_direct_feature(es):
    """A direct feature of an aggregation keeps the aggregated variable's
    original type."""
    mode_feat = Mode(es["log"]["priority_level"], es["customers"])
    stacked = Feature(mode_feat, es["sessions"])
    assert stacked.variable_type == es["log"]["priority_level"].__class__
def test_squared(es):
    """Multiplying a feature by itself should deduplicate base features
    down to a single entry."""
    feat = Feature(es['log']['value'])
    product = feat * feat
    assert len(product.base_features) == 1
    assert product.base_features[0].hash() == feat.hash()
def session_id_feat(es):
    """Identity feature over the sessions entity's id variable."""
    return Feature(es['sessions']['id'])
def product_id_feat(es):
    """Identity feature over the log entity's product_id variable."""
    return Feature(es['log']['product_id'])
def datetime_feat(es):
    """Identity feature over the log entity's datetime variable."""
    return Feature(es['log']['datetime'])
def test_overrides(es):
    """Python operator overloads on Feature objects must hash identically
    to the explicitly constructed primitive features."""
    value = Feature(es['log']['value'])
    value2 = Feature(es['log']['value_2'])
    feats = [Add, Subtract, Multiply, Divide]
    compare_ops = [
        GreaterThan, LessThan, Equals, NotEquals,
        GreaterThanEqualTo, LessThanEqualTo
    ]
    assert Negate(value).hash() == (-value).hash()

    # Each (left, right) pair is exercised with all arithmetic ops followed
    # by all comparison ops; `overrides` lists the operator-syntax versions
    # in the same order.
    compares = [(value, value), (value, value2), (value2, 2)]
    overrides = [
        value + value, value - value, value * value, value / value,
        value > value, value < value, value == value, value != value,
        value >= value, value <= value,

        value + value2, value - value2, value * value2, value / value2,
        value > value2, value < value2, value == value2, value != value2,
        value >= value2, value <= value2,

        value2 + 2, value2 - 2, value2 * 2, value2 / 2,
        value2 > 2, value2 < 2, value2 == 2, value2 != 2,
        value2 >= 2, value2 <= 2,
    ]
    i = 0
    for left, right in compares:
        for feat in feats:
            f = feat(left, right)
            o = overrides[i]
            assert o.hash() == f.hash()
            i += 1
        for compare_op in compare_ops:
            f = compare_op(left, right)
            o = overrides[i]
            assert o.hash() == f.hash()
            i += 1

    # Reflected arithmetic (scalar on the left).
    # Fix: the original guarded this loop with `if feat != Mod`, but Mod is
    # never in `feats`, so the test was dead code — and a latent NameError
    # if Mod is not imported. Removed.
    our_reverse_overrides = [
        2 + value2, 2 - value2, 2 * value2, 2 / value2]
    for i, feat in enumerate(feats):
        f = feat(2, value2)
        o = our_reverse_overrides[i]
        assert o.hash() == f.hash()

    # Reflected comparisons: Python rewrites `2 < value2` as
    # `value2.__gt__(2)`, hence the flipped order below.
    python_reverse_overrides = [
        2 < value2, 2 > value2, 2 == value2, 2 != value2,
        2 <= value2, 2 >= value2]
    for i, compare_op in enumerate(compare_ops):
        f = compare_op(value2, 2)
        o = python_reverse_overrides[i]
        assert o.hash() == f.hash()