Пример #1
0
def test_direct_percentile(es):
    v = Feature(es['customers']['age'])
    p = Percentile(v)
    d = Feature(p, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    cust_vals = es['customers'].df[[v.get_name()]]
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    true_p = cust_vals['percentile'].loc[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
Пример #2
0
def test_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_direct_percentile(es):
    v = Feature(es['customers']['age'])
    p = Percentile(v)
    d = Feature(p, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    cust_vals = es['customers'].df[[v.get_name()]]
    cust_vals['percentile'] = cust_vals[v.get_name()].rank(pct=True)
    true_p = cust_vals['percentile'].loc[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['sessions'])
    pandas_backend = PandasBackend(es, [agg])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    true_p = log_vals.groupby('session_id')['percentile'].sum()[[0, 1]]
    for t, a in zip(true_p.values, df[agg.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
Пример #5
0
def test_direct_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    true_p = true_p[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_direct_agg_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    agg = Sum(p, es['customers'])
    d = Feature(agg, es['sessions'])
    pandas_backend = PandasBackend(es, [d])
    df = pandas_backend.calculate_all_features([0, 1], None)

    log_vals = es['log'].df[[v.get_name(), 'session_id']]
    log_vals['percentile'] = log_vals[v.get_name()].rank(pct=True)
    log_vals['customer_id'] = [0] * 10 + [1] * 5 + [2] * 2
    true_p = log_vals.groupby('customer_id')['percentile'].sum().fillna(0)
    true_p = true_p[[0, 0]]
    for t, a in zip(true_p.values, df[d.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or round(t, 3) == round(a, 3)
def test_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    pandas_backend = PandasBackend(es, [v, p])
    df = pandas_backend.calculate_all_features(range(17), None)
    true = df[v.get_name()].rank(pct=True)
    for t, a in zip(true.values, df[p.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_isin_feat_other_syntax_int(es):
    isin = Feature(es['log']['value']).isin([5, 10])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [False, True, True, False, False, False, False, False]
    v = df[isin.get_name()].values.tolist()
    assert true == v
def test_isin_feat_other_syntax(es):
    isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [True, True, True, False, False, True, True, True]
    v = df[isin.get_name()].values.tolist()
    assert true == v
def test_isin_feat_other_syntax_int(es):
    isin = Feature(es['log']['value']).isin([5, 10])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [False, True, True, False, False, False, False, False]
    v = df[isin.get_name()].values.tolist()
    assert true == v
def test_isin_feat_other_syntax(es):
    isin = Feature(es['log']['product_id']).isin(["toothpaste", "coke zero"])
    features = [isin]
    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(range(8), None)
    true = [True, True, True, False, False, True, True, True]
    v = df[isin.get_name()].values.tolist()
    assert true == v
def test_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    pandas_backend = PandasBackend(es, [p])
    df = pandas_backend.calculate_all_features(range(10, 17), None)
    true = es['log'].df[v.get_name()].rank(pct=True)
    true = true.loc[range(10, 17)]
    for t, a in zip(true.values, df[p.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
Пример #13
0
def test_dependent_percentile(es):
    v = Feature(es['log']['value'])
    p = Percentile(v)
    p2 = Percentile(p - 1)
    pandas_backend = PandasBackend(es, [p, p2])
    df = pandas_backend.calculate_all_features(range(10, 17), None)
    true = es['log'].df[v.get_name()].rank(pct=True)
    true = true.loc[range(10, 17)]
    for t, a in zip(true.values, df[p.get_name()].values):
        assert (pd.isnull(t) and pd.isnull(a)) or t == a
def test_max_hlevel(es):
    kwargs = dict(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[Count, Last],
        trans_primitives=[Hour],
        max_depth=-1,
    )

    dfs_h_n1 = DeepFeatureSynthesis(max_hlevel=-1, **kwargs)
    dfs_h_0 = DeepFeatureSynthesis(max_hlevel=0, **kwargs)
    dfs_h_1 = DeepFeatureSynthesis(max_hlevel=1, **kwargs)
    feats_n1 = dfs_h_n1.build_features()
    feats_n1 = [f.get_name() for f in feats_n1]
    feats_0 = dfs_h_0.build_features()
    feats_0 = [f.get_name() for f in feats_0]
    feats_1 = dfs_h_1.build_features()
    feats_1 = [f.get_name() for f in feats_1]

    customer_log = Last(es['log']['value'], es['customers'])
    session_log = Last(es['log']['value'], es['sessions'])
    log_customer_log = Feature(customer_log, es['log'])
    log_session_log = Feature(session_log, es['log'])
    assert log_customer_log.get_name() in feats_n1
    assert log_session_log.get_name() in feats_n1

    assert log_customer_log.get_name() not in feats_1
    assert log_session_log.get_name() in feats_1

    assert log_customer_log.get_name() not in feats_0
    assert log_session_log.get_name() not in feats_0
def test_max_hlevel(es):
    kwargs = dict(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[Count, Last],
        trans_primitives=[Hour],
        max_depth=-1,
    )

    dfs_h_n1 = DeepFeatureSynthesis(max_hlevel=-1, **kwargs)
    dfs_h_0 = DeepFeatureSynthesis(max_hlevel=0, **kwargs)
    dfs_h_1 = DeepFeatureSynthesis(max_hlevel=1, **kwargs)
    feats_n1 = dfs_h_n1.build_features()
    feats_n1 = [f.get_name() for f in feats_n1]
    feats_0 = dfs_h_0.build_features()
    feats_0 = [f.get_name() for f in feats_0]
    feats_1 = dfs_h_1.build_features()
    feats_1 = [f.get_name() for f in feats_1]

    customer_log = Last(es['log']['value'], es['customers'])
    session_log = Last(es['log']['value'], es['sessions'])
    log_customer_log = Feature(customer_log, es['log'])
    log_session_log = Feature(session_log, es['log'])
    assert log_customer_log.get_name() in feats_n1
    assert log_session_log.get_name() in feats_n1

    assert log_customer_log.get_name() not in feats_1
    assert log_session_log.get_name() in feats_1

    assert log_customer_log.get_name() not in feats_0
    assert log_session_log.get_name() not in feats_0