def test_direct_percentile(es):
    """Check a Percentile feature pulled onto sessions as a direct feature.

    A Percentile of the customers' ``age`` variable is wrapped in a direct
    feature on ``sessions`` and calculated for sessions 0 and 1. The expected
    values are the percentile ranks of the age column over the whole
    customers dataframe, looked up for customer 0 twice.
    """
    v = Feature(es['customers']['age'])
    p = Percentile(v)
    d = Feature(p, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    # Rank the column Series directly. Adding a 'percentile' column to a
    # slice of the entity's dataframe triggers pandas' SettingWithCopyWarning
    # and the temporary frame is not needed for the comparison.
    age_percentiles = es['customers'].df[v.get_name()].rank(pct=True)
    # Customer 0 is looked up twice, presumably because sessions 0 and 1 both
    # belong to customer 0 -- confirm against the fixture.
    true_p = age_percentiles.loc[[0, 0]]

    for t, a in zip(true_p.values, df[d.get_name()].values):
        # NaN never compares equal to itself, so matching nulls are accepted
        # explicitly.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_percentile(es):
    """Check a Sum aggregation over a Percentile feature on sessions.

    The Percentile of the log ``value`` variable is summed per session and
    calculated for sessions 0 and 1. The expected values rank the value
    column over the whole log dataframe and sum the ranks per ``session_id``.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    # Work on an explicit copy: assigning a new column onto a slice of the
    # entity's dataframe triggers pandas' SettingWithCopyWarning.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]

    for t, a in zip(true_p.values, df[agg.get_name()].values):
        # NaN never compares equal to itself, so matching nulls are accepted
        # explicitly.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_agg_percentile(es):
    """Check a direct feature on sessions of a per-customer Sum of Percentile.

    The Percentile of the log ``value`` variable is summed per customer, then
    exposed on ``sessions`` through a direct feature and calculated for
    sessions 0 and 1. Expected and actual values are compared after rounding
    to three decimals.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    # Work on an explicit copy: assigning new columns onto a slice of the
    # entity's dataframe triggers pandas' SettingWithCopyWarning.
    log_vals = es['log'].df[[v.get_name(), 'session_id']].copy()
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    # Hard-coded log-row -> customer mapping (17 rows in total); presumably
    # mirrors the fixture's log/session/customer layout -- confirm against
    # the fixture.
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    # Customer 0 is looked up twice, once per requested session.
    true_p = true_p[[0, 0]]

    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_percentile_all_instances(es):
    """Check Percentile against a rank of the base feature over 17 log rows.

    Both the base ``value`` feature and its Percentile are calculated for
    instance ids ``range(17)``; the expected percentiles are obtained by
    ranking the calculated base-feature column.

    This test used to be named ``test_percentile``, the same name as a later
    test in this module. The later definition rebinds the name, so this one
    was never collected by pytest. The distinct name lets both tests run.
    """
    v = Feature(es['log']['value'])
    p = Percentile(v)
    pandas_backend = PandasBackend(es, [v, p])
    df = pandas_backend.calculate_all_features(range(17), None)

    true = df[v.get_name()].rank(pct=True)
    for t, a in zip(true.values, df[p.get_name()].values):
        # NaN never compares equal to itself, so matching nulls are accepted
        # explicitly.
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_isin_feat_other_syntax_int(es):
    """Check the ``.isin([...])`` method syntax on an integer-valued feature.

    Builds an isin feature over the log ``value`` variable for the values
    5 and 10, and compares the result for instance ids ``range(8)`` against
    a fixed list of booleans.
    """
    isin_feature = Feature(es['log']['value']).isin([5, 10])
    backend = PandasBackend(es, [isin_feature])
    result = backend.calculate_all_features(range(8), None)

    expected = [
        False, True, True, False,
        False, False, False, False,
    ]
    observed = result[isin_feature.get_name()].values.tolist()
    assert expected == observed
def test_isin_feat_other_syntax(es):
    """Check the ``.isin([...])`` method syntax on a string-valued feature.

    Builds an isin feature over the log ``product_id`` variable for
    "toothpaste" and "coke zero", and compares the result for instance ids
    ``range(8)`` against a fixed list of booleans.
    """
    wanted_products = ["toothpaste", "coke zero"]
    isin_feature = Feature(es['log']['product_id']).isin(wanted_products)
    backend = PandasBackend(es, [isin_feature])
    result = backend.calculate_all_features(range(8), None)

    expected = [
        True, True, True, False,
        False, True, True, True,
    ]
    observed = result[isin_feature.get_name()].values.tolist()
    assert expected == observed
def test_percentile(es):
    """Check Percentile on a subset of log rows against a full-column rank.

    The Percentile of the log ``value`` variable is calculated for instance
    ids 10..16 only. The expected values rank the whole value column of the
    log dataframe and then select those same ids.
    """
    value = Feature(es['log']['value'])
    percentile = Percentile(value)
    instance_ids = range(10, 17)

    backend = PandasBackend(es, [percentile])
    calculated = backend.calculate_all_features(instance_ids, None)

    full_rank = es['log'].df[value.get_name()].rank(pct=True)
    expected = full_rank.loc[instance_ids].values
    actual = calculated[percentile.get_name()].values

    for want, got in zip(expected, actual):
        # Two missing values count as a match; NaN != NaN otherwise.
        if pd.isnull(want) and pd.isnull(got):
            continue
        assert want == got
def test_dependent_percentile(es):
    """Check Percentile when another Percentile depends on it.

    ``p`` is the Percentile of the log ``value`` variable and ``p2`` is a
    Percentile of ``p - 1``. Both are calculated for instance ids 10..16,
    and ``p`` is compared against a rank of the whole value column
    restricted to those ids.
    """
    value = Feature(es['log']['value'])
    p = Percentile(value)
    p2 = Percentile(p - 1)
    instance_ids = range(10, 17)

    backend = PandasBackend(es, [p, p2])
    calculated = backend.calculate_all_features(instance_ids, None)

    full_rank = es['log'].df[value.get_name()].rank(pct=True)
    expected = full_rank.loc[instance_ids].values
    actual = calculated[p.get_name()].values

    # NOTE(review): only p is asserted; p2 is calculated but its values are
    # never checked here.
    for want, got in zip(expected, actual):
        # Two missing values count as a match; NaN != NaN otherwise.
        if pd.isnull(want) and pd.isnull(got):
            continue
        assert want == got
def test_max_hlevel(es):
    """Check which direct-of-aggregation features appear per ``max_hlevel``.

    DeepFeatureSynthesis is run on the ``log`` entity with ``max_hlevel`` set
    to -1, 0 and 1. The test then checks whether the names of two direct
    features on ``log`` -- one wrapping ``Last(value)`` on customers, one
    wrapping ``Last(value)`` on sessions -- are among the generated features.
    """
    shared_args = {
        'target_entity_id': 'log',
        'entityset': es,
        'agg_primitives': [Count, Last],
        'trans_primitives': [Hour],
        'max_depth': -1,
    }

    # Construct every synthesizer first, then build features, in the order
    # -1, 0, 1.
    hlevels = (-1, 0, 1)
    synthesizers = [
        DeepFeatureSynthesis(max_hlevel=hlevel, **shared_args)
        for hlevel in hlevels
    ]
    names_by_hlevel = {}
    for hlevel, synthesizer in zip(hlevels, synthesizers):
        built = synthesizer.build_features()
        names_by_hlevel[hlevel] = [feature.get_name() for feature in built]

    last_on_customers = Last(es['log']['value'], es['customers'])
    last_on_sessions = Last(es['log']['value'], es['sessions'])
    via_customers = Feature(last_on_customers, es['log'])
    via_sessions = Feature(last_on_sessions, es['log'])

    # max_hlevel=-1: both features are generated.
    assert via_customers.get_name() in names_by_hlevel[-1]
    assert via_sessions.get_name() in names_by_hlevel[-1]

    # max_hlevel=1: only the sessions-based feature is generated.
    assert via_customers.get_name() not in names_by_hlevel[1]
    assert via_sessions.get_name() in names_by_hlevel[1]

    # max_hlevel=0: neither feature is generated.
    assert via_customers.get_name() not in names_by_hlevel[0]
    assert via_sessions.get_name() not in names_by_hlevel[0]