def test_makes_direct_features_along_multiple_paths(diamond_es):
    """Both routes to ``regions.name`` are built for the 'transactions' target.

    DFS runs with ``max_depth=3`` and no aggregation or transform primitives;
    the result must contain the feature reached through ``customers`` as well
    as the one reached through ``stores``.
    """
    synthesis = DeepFeatureSynthesis(
        target_entity_id='transactions',
        entityset=diamond_es,
        max_depth=3,
        agg_primitives=[],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    for expected_name in ('customers.regions.name', 'stores.regions.name'):
        assert feature_with_name(built, expected_name)
示例#2
0
def test_does_not_make_trans_of_single_direct_feature(es):
    """WEEKDAY is not wrapped around a single direct feature.

    With target 'sessions', ``max_depth=2`` and only the 'weekday' transform,
    the transform-of-direct name must be absent while the direct-of-transform
    name must be present.
    """
    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[],
        trans_primitives=['weekday'],
        max_depth=2,
    )
    built = synthesis.build_features()

    trans_of_direct = 'WEEKDAY(customers.signup_date)'
    direct_of_trans = 'customers.WEEKDAY(signup_date)'
    assert not feature_with_name(built, trans_of_direct)
    assert feature_with_name(built, direct_of_trans)
示例#3
0
def test_makes_numtrue(es):
    """NUM_TRUE over ``log.purchased`` is built for 'sessions', directly and via customers.

    The test is flagged as an expected failure when the entityset reports the
    Spark dataframe type.
    """
    uses_spark = es.dataframe_type == Library.SPARK.value
    if uses_spark:
        pytest.xfail("Spark EntitySets do not support NumTrue primitive")

    synthesis = DeepFeatureSynthesis(
        target_dataframe_name="sessions",
        entityset=es,
        agg_primitives=[NumTrue],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    expected_names = (
        "customers.NUM_TRUE(log.purchased)",
        "NUM_TRUE(log.purchased)",
    )
    for expected_name in expected_names:
        assert feature_with_name(built, expected_name)
示例#4
0
def test_does_not_make_agg_of_direct_of_target_entity(es):
    """LAST is not applied to direct features that lead back to the target entity.

    A COUNT of session ids per customer is passed as a seed feature; DFS on
    'customers' with ``max_depth=2`` and the Last primitive must not produce
    the two round-trip feature names checked below.
    """
    sessions_per_customer = ft.Feature(
        es['sessions']["id"],
        parent_entity=es['customers'],
        primitive=Count,
    )
    synthesis = DeepFeatureSynthesis(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[sessions_per_customer],
    )
    built = synthesis.build_features()

    # Meaningless feature: customers.COUNT(sessions) is already defined on
    # the customers entity.
    round_trip_count = 'LAST(sessions.customers.COUNT(sessions))'
    assert not feature_with_name(built, round_trip_count)

    round_trip_age = 'LAST(sessions.customers.age)'
    assert not feature_with_name(built, round_trip_age)
示例#5
0
def test_makes_agg_features_with_where(es):
    """COUNT features with a WHERE clause are built for the 'sessions' target.

    Interesting values are added to the entityset first, and Count is passed
    both as an aggregation primitive and as a where primitive.
    """
    es.add_interesting_values()

    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        where_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    where_on_own_variable = 'COUNT(log WHERE priority_level = 0)'
    assert feature_with_name(built, where_on_own_variable)

    # WHERE conditions must also be built from direct features.
    where_on_direct_feature = 'COUNT(log WHERE products.department = food)'
    assert feature_with_name(built, where_on_direct_feature)
示例#6
0
def test_primitive_options_groupbys(es):
    """Per-primitive options limit which groupby features DFS builds on 'customers'.

    Five cumulative primitives are passed as groupby transform primitives,
    each with its own include/ignore options. The assertions check that:

    * ``CUM_SUM(age) by région_id`` is built;
    * no feature uses 'cum_min' or 'cum_max';
    * CumMean features never group by ``région_id`` or ``id``;
    * CumCount features group only by ``région_id`` or ``cohort``;
    * CumSum features that depend on 'customers' group by ``région_id``.
    """
    options = {
        'cum_sum': {
            'include_groupby_variables': {'customers': [u'région_id']},
            'ignore_groupby_variables': {'sessions': ['customer_id']},
        },
        'cum_mean': {
            'ignore_groupby_variables': {'customers': [u'région_id', 'id']},
        },
        'cum_count': {
            'include_entities': ['customers'],
            'include_groupby_variables': {'customers': [u"région_id", "cohort"]},
        },
        'cum_min': {'ignore_entities': ['customers']},
        'cum_max': {'include_entities': ['cohorts']},
    }
    dfs_obj = DeepFeatureSynthesis(
        target_entity_id='customers',
        entityset=es,
        groupby_trans_primitives=[
            'cum_sum',
            'cum_count',
            'cum_min',
            'cum_max',
            'cum_mean',
        ],
        primitive_options=options,
    )
    features = dfs_obj.build_features()
    assert feature_with_name(features, u'CUM_SUM(age) by région_id')

    # These either have nothing to groupby or don't include the target entity,
    # so they shouldn't create features. (The original list named 'cum_max'
    # twice; the duplicate added nothing to the check.)
    suppressed_primitives = ('cum_min', 'cum_max')

    for f in features:
        assert f.primitive.name not in suppressed_primitives
        if isinstance(f.primitive, CumMean):
            assert f.groupby.variable.id not in [u'région_id', 'id']
        if isinstance(f.primitive, CumCount):
            assert f.groupby.variable.id in [u'région_id', 'cohort']
        if isinstance(f.primitive, CumSum):
            # Only CumSum features touching 'customers' are constrained to
            # the région_id groupby.
            dependency_entities = [d.entity.id for d in f.get_dependencies()]
            if 'customers' in dependency_entities:
                assert f.groupby.variable.id == u'région_id'
示例#7
0
def test_dfeats_where(es):
    """WHERE-clause COUNT features appear on 'sessions', including as direct features.

    Interesting values are added to the entityset before DFS runs with Count
    as the only aggregation primitive.
    """
    es.add_interesting_values()

    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    # Direct features of aggregation features with a where clause are built.
    direct_of_where_agg = 'customers.COUNT(log WHERE priority_level = 0)'
    assert feature_with_name(built, direct_of_where_agg)

    where_on_direct = 'COUNT(log WHERE products.department = electronics)'
    assert feature_with_name(built, where_on_direct)
示例#8
0
def test_make_groupby_features_with_id(es):
    """'cum_count' on 'sessions' yields ``CUM_COUNT(customer_id) by customer_id``."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[],
        groupby_trans_primitives=['cum_count'],
    )
    built = synthesis.build_features()

    expected_name = "CUM_COUNT(customer_id) by customer_id"
    assert feature_with_name(built, expected_name)
示例#9
0
def test_makes_dfeatures_of_agg_primitives(es):
    """A direct feature of a LAST aggregation is built for the 'sessions' target."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    expected_name = 'customers.LAST(sessions.device_type)'
    assert feature_with_name(built, expected_name)
示例#10
0
def test_initialized_agg_prim(es):
    """An already-instantiated aggregation primitive is accepted by DFS.

    ``NMostCommon(n=3)`` is passed as an instance rather than a class, and the
    resulting feature on ``log.product_id`` must exist for 'sessions'.
    """
    three_most_common = NMostCommon(n=3)
    synthesis = DeepFeatureSynthesis(
        target_entity_id="sessions",
        entityset=es,
        agg_primitives=[three_most_common],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    expected_name = "N_MOST_COMMON(log.product_id)"
    assert feature_with_name(built, expected_name)
示例#11
0
def test_makes_trans_feat(es):
    """The Hour transform produces ``HOUR(datetime)`` for the 'log' target."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[Hour],
    )
    built = synthesis.build_features()

    expected_name = 'HOUR(datetime)'
    assert feature_with_name(built, expected_name)
示例#12
0
def test_handles_time_since_previous_entity_groupby(es):
    """TimeSincePrevious as a groupby transform is grouped by ``session_id`` on 'log'."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        groupby_trans_primitives=[TimeSincePrevious],
    )
    built = synthesis.build_features()

    expected_name = 'TIME_SINCE_PREVIOUS(datetime) by session_id'
    assert feature_with_name(built, expected_name)
示例#13
0
def test_makes_agg_features(es):
    """The Last aggregation produces ``LAST(log.value)`` for the 'sessions' target."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    expected_name = 'LAST(log.value)'
    assert feature_with_name(built, expected_name)
示例#14
0
def test_transform_no_stack_agg(es):
    """NotEqual is not stacked on top of an N_MOST_COMMON aggregation.

    Feature definitions only are requested (``features_only=True``) for the
    'customers' target with ``max_depth=3``.
    """
    definitions = ft.dfs(
        entityset=es,
        target_entity="customers",
        agg_primitives=[NMostCommon],
        trans_primitives=[NotEqual],
        max_depth=3,
        features_only=True,
    )

    unwanted_name = 'id != N_MOST_COMMON(sessions.device_type)'
    assert not feature_with_name(definitions, unwanted_name)
示例#15
0
def test_make_groupby_features(es):
    """'cum_sum' as a groupby transform yields ``CUM_SUM(value) by session_id`` on 'log'."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[],
        groupby_trans_primitives=['cum_sum'],
    )
    built = synthesis.build_features()

    expected_name = "CUM_SUM(value) by session_id"
    assert feature_with_name(built, expected_name)
示例#16
0
def test_make_groupby_features_with_agg(es):
    """A 'sum' aggregation is stacked on a 'cum_sum' groupby feature for 'cohorts'."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='cohorts',
        entityset=es,
        agg_primitives=['sum'],
        trans_primitives=[],
        groupby_trans_primitives=['cum_sum'],
    )
    built = synthesis.build_features()

    # Aggregation applied on top of the groupby transform feature.
    expected_name = u"SUM(customers.CUM_SUM(age) by région_id)"
    assert feature_with_name(built, expected_name)
示例#17
0
def test_intialized_trans_prim(es):
    """An already-instantiated transform primitive is accepted by DFS.

    ``IsIn(list_of_outputs=['coke zero'])`` is passed as an instance, and the
    matching feature on ``product_id`` must exist for the 'log' target.
    """
    is_coke_zero = IsIn(list_of_outputs=['coke zero'])
    synthesis = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[is_coke_zero],
    )
    built = synthesis.build_features()

    expected_name = "product_id.isin(['coke zero'])"
    assert feature_with_name(built, expected_name)
示例#18
0
def test_makes_direct_of_agg_of_trans_on_target(es):
    """A direct feature of MEAN over ABSOLUTE(value) is built for the 'log' target.

    DFS runs with the 'mean' aggregation, the Absolute transform and
    ``max_depth=3``.
    """
    synthesis = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=['mean'],
        trans_primitives=[Absolute],
        max_depth=3,
    )
    built = synthesis.build_features()

    expected_name = 'sessions.MEAN(log.ABSOLUTE(value))'
    assert feature_with_name(built, expected_name)
示例#19
0
def test_make_groupby_features_with_diff_id(es):
    """'cum_count' on 'customers' yields ``CUM_COUNT(cohort) by région_id``."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[],
        groupby_trans_primitives=['cum_count'],
    )
    built = synthesis.build_features()

    # The counted variable and the groupby variable are different ids.
    expected_name = u"CUM_COUNT(cohort) by région_id"
    assert feature_with_name(built, expected_name)
示例#20
0
def test_makes_trans_of_multiple_direct_features(diamond_es):
    """Equal is applied across direct features unless they share the same path.

    DFS runs on 'transactions' with the 'mean' aggregation, the Equal
    transform and ``max_depth=4``; the assertions list which pairings must and
    must not be present.
    """
    synthesis = DeepFeatureSynthesis(
        target_entity_id='transactions',
        entityset=diamond_es,
        agg_primitives=['mean'],
        trans_primitives=[Equal],
        max_depth=4,
    )
    built = synthesis.build_features()

    # Transform of a direct feature and a non-direct feature.
    assert feature_with_name(
        built, 'amount = stores.MEAN(transactions.amount)')

    # Transform of direct features on different entities.
    assert feature_with_name(
        built, 'customers.MEAN(transactions.amount) = stores.square_ft')

    # Transform of direct features on the same entity reached by different paths.
    assert feature_with_name(
        built, 'customers.regions.name = stores.regions.name')

    # No transform of direct features that share the same path.
    same_path_pairings = (
        'stores.square_ft = stores.MEAN(transactions.amount)',
        'stores.MEAN(transactions.amount) = stores.square_ft',
    )
    for unwanted_name in same_path_pairings:
        assert not feature_with_name(built, unwanted_name)

    # Confusingly named, but this one is a direct feature of a transform.
    assert feature_with_name(
        built, 'stores.MEAN(transactions.amount) = square_ft')
示例#21
0
def test_makes_count(es):
    """DFS on 'sessions' with Count builds the expected identity, direct and COUNT features."""
    synthesis = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    expected_names = (
        'device_type',
        'customer_id',
        u'customers.région_id',
        'customers.age',
        'COUNT(log)',
        'customers.COUNT(sessions)',
        u'customers.régions.language',
        'customers.COUNT(log)',
    )
    for expected_name in expected_names:
        assert feature_with_name(built, expected_name)
示例#22
0
def test_makes_count(es):
    """DFS on the 'sessions' dataframe with Count builds the expected feature names."""
    synthesis = DeepFeatureSynthesis(
        target_dataframe_name="sessions",
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    expected_names = (
        "device_type",
        "customer_id",
        "customers.région_id",
        "customers.age",
        "COUNT(log)",
        "customers.COUNT(sessions)",
        "customers.régions.language",
        "customers.COUNT(log)",
    )
    for expected_name in expected_names:
        assert feature_with_name(built, expected_name)
示例#23
0
def test_seed_multi_output_feature_stacking(es):
    """NUM_UNIQUE is stacked on each output of a multi-output seed feature.

    The seed is ``NMostCommon(3)`` over ``log.product_id`` aggregated to
    'sessions'; DFS on 'customers' must then contain a NUM_UNIQUE feature for
    output indices 0, 1 and 2.
    """
    top_three = NMostCommon(3)
    seed = ft.Feature(
        es['log']['product_id'],
        parent_entity=es["sessions"],
        primitive=top_three,
    )

    # Only the feature definitions are inspected; the matrix is unused.
    _matrix, definitions = ft.dfs(
        entityset=es,
        target_entity="customers",
        seed_features=[seed],
        agg_primitives=[NumUnique],
        trans_primitives=[],
        max_depth=4,
    )

    name_template = 'NUM_UNIQUE(sessions.N_MOST_COMMON(log.product_id)[%d])'
    for output_index in range(3):
        assert feature_with_name(definitions, name_template % output_index)
示例#24
0
def test_makes_direct_features_through_multiple_relationships(games_es):
    """Direct MEAN features exist for every home/away relationship combination.

    For the 'games' target, each combination of forward relationship,
    backward relationship and score variable (each 'home' or 'away') must
    have a matching feature name.
    """
    synthesis = DeepFeatureSynthesis(
        target_entity_id='games',
        entityset=games_es,
        agg_primitives=['mean'],
        trans_primitives=[],
    )
    built = synthesis.build_features()

    sides = ['home', 'away']
    name_template = 'teams[%s_team_id].MEAN(games[%s_team_id].%s_team_score)'
    # Same iteration order as three nested loops over `sides`.
    combinations = [
        (forward, backward, scored)
        for forward in sides
        for backward in sides
        for scored in sides
    ]
    for combination in combinations:
        assert feature_with_name(built, name_template % combination)
def test_make_transform_multiple_output_features(pd_es):
    """A six-output transform primitive matches the single-part primitives and stacks.

    A custom primitive splitting a datetime into year/month/day/hour/minute/
    second is built with ``make_trans_primitive``. After running DFS on 'log',
    each of its output columns must equal the column of the corresponding
    single-output primitive, and stacked features on each output must exist.
    """
    # NOTE(review): the function name is kept as `test_time` because the
    # 'TEST_TIME' feature names asserted below presumably derive from it —
    # confirm against make_trans_primitive before renaming.
    def test_time(x):
        as_series = pd.Series(x)
        parts = ["year", "month", "day", "hour", "minute", "second"]
        return [
            as_series.apply(lambda stamp: getattr(stamp, part))
            for part in parts
        ]

    def gen_feat_names(self):
        labels = ["Year", "Month", "Day", "Hour", "Minute", "Second"]
        return [
            "Now.%s(%s)" % (label, self.base_features[0].get_name())
            for label in labels
        ]

    TestTime = make_trans_primitive(
        function=test_time,
        input_types=[ColumnSchema(logical_type=Datetime)],
        return_type=ColumnSchema(semantic_tags={'numeric'}),
        number_output_features=6,
        cls_attributes={"get_feature_names": gen_feat_names},
    )

    single_part_primitives = [Year, Month, Day, Hour, Minute, Second]

    multi_output_feature = ft.Feature(
        pd_es["log"].ww["datetime"], primitive=TestTime)
    single_output_features = [
        ft.Feature(pd_es["log"].ww["datetime"], primitive=single_part)
        for single_part in single_part_primitives
    ]

    matrix, definitions = ft.dfs(
        entityset=pd_es,
        target_dataframe_name="log",
        agg_primitives=['sum'],
        trans_primitives=[TestTime] + single_part_primitives + [Diff],
        max_depth=5,
    )

    # Each output column of the custom primitive must equal the column of the
    # matching single-part primitive.
    multi_names = multi_output_feature.get_feature_names()
    single_names = [feat.get_name() for feat in single_output_features]
    for multi_col, single_col in zip(multi_names, single_names):
        assert (matrix[multi_col] == matrix[single_col]).all()

    for output_index in range(6):
        stacked_name = (
            'sessions.customers.SUM(log.TEST_TIME(datetime)[%d])' % output_index
        )
        assert feature_with_name(definitions, stacked_name)
        # NOTE(review): this tests membership of a plain string in the list
        # returned by dfs rather than looking the name up with
        # feature_with_name — confirm it really compares feature names.
        diff_name = (
            'products.DIFF(SUM(log.TEST_TIME(datetime)[%d]))' % output_index
        )
        assert diff_name in definitions
def test_groupby_multi_output_stacking(pd_es):
    """SUM is stacked on every output of a six-output groupby transform primitive.

    The primitive is declared as a ``TransformPrimitive`` subclass and passed
    as a groupby transform; only feature definitions are requested for the
    'sessions' target with ``max_depth=4``.
    """
    class TestTime(TransformPrimitive):
        name = "test_time"
        input_types = [ColumnSchema(logical_type=Datetime)]
        return_type = ColumnSchema(semantic_tags={"numeric"})
        number_output_features = 6

    definitions = dfs(
        entityset=pd_es,
        target_dataframe_name="sessions",
        agg_primitives=["sum"],
        groupby_trans_primitives=[TestTime],
        features_only=True,
        max_depth=4,
    )

    by_product_template = "SUM(log.TEST_TIME(datetime)[%d] by product_id)"
    by_session_template = (
        "customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)"
    )
    for output_index in range(6):
        assert feature_with_name(
            definitions, by_product_template % output_index)
        # NOTE(review): this tests membership of a plain string in the list
        # returned by dfs rather than looking the name up with
        # feature_with_name — confirm it really compares feature names.
        assert (by_session_template % output_index) in definitions
示例#27
0
def test_groupby_multi_output_stacking(pd_es):
    """SUM is stacked on every output of a six-output groupby transform primitive.

    The primitive is built with ``make_trans_primitive`` from an identity
    lambda and passed as a groupby transform; only feature definitions are
    requested for the 'sessions' target with ``max_depth=4``.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )

    definitions = dfs(
        entityset=pd_es,
        target_entity="sessions",
        agg_primitives=['sum'],
        groupby_trans_primitives=[TestTime],
        features_only=True,
        max_depth=4,
    )

    by_product_template = 'SUM(log.TEST_TIME(datetime)[%d] by product_id)'
    by_session_template = (
        'customers.SUM(log.TEST_TIME(datetime)[%d] by session_id)'
    )
    for output_index in range(6):
        assert feature_with_name(
            definitions, by_product_template % output_index)
        # NOTE(review): this tests membership of a plain string in the list
        # returned by dfs rather than looking the name up with
        # feature_with_name — confirm it really compares feature names.
        assert (by_session_template % output_index) in definitions
示例#28
0
def test_groupby_multi_output_stacking(es):
    """CumSum as a groupby transform is stacked on each output of a six-output transform.

    The multi-output primitive is built with ``make_trans_primitive`` from an
    identity lambda and passed as a regular transform; only feature
    definitions are requested for the 'sessions' target with ``max_depth=4``.
    """
    TestTime = make_trans_primitive(
        function=lambda x: x,
        name="test_time",
        input_types=[Datetime],
        return_type=Numeric,
        number_output_features=6,
    )

    definitions = dfs(
        entityset=es,
        target_entity="sessions",
        agg_primitives=[],
        trans_primitives=[TestTime],
        groupby_trans_primitives=[CumSum],
        features_only=True,
        max_depth=4,
    )

    by_cohort_template = (
        'customers.CUM_SUM(TEST_TIME(upgrade_date)[%d]) by cohort'
    )
    by_customer_template = (
        'customers.CUM_SUM(TEST_TIME(date_of_birth)[%d]) by customer_id'
    )
    for output_index in range(6):
        assert feature_with_name(
            definitions, by_cohort_template % output_index)
        # NOTE(review): this tests membership of a plain string in the list
        # returned by dfs rather than looking the name up with
        # feature_with_name — confirm it really compares feature names.
        assert (by_customer_template % output_index) in definitions
示例#29
0
def test_transform_consistency():
    """Transform features over a small frame are generated with a fixed argument order.

    A five-column dataframe is loaded into a fresh entityset as the entity
    'first', then DFS runs with the 'and', 'add_numeric' and 'or' transforms.
    The expected feature names must exist and ``AND(b1, b)`` must not.
    """
    # Source data: numeric columns a, b12, P and boolean columns b, b1.
    frame = pd.DataFrame({
        'a': [14, 12, 10],
        'b': [False, False, True],
        'b1': [True, True, False],
        'b12': [4, 5, 6],
        'P': [10, 15, 12],
    })

    # Register the frame, letting the entityset create the 'index' column.
    es = ft.EntitySet(id='test')
    es.entity_from_dataframe(
        entity_id='first',
        dataframe=frame,
        index='index',
        make_index=True,
    )

    feature_defs = ft.dfs(
        entityset=es,
        target_entity='first',
        trans_primitives=['and', 'add_numeric', 'or'],
        features_only=True,
    )

    # Check for correct ordering of features.
    present_before = ('a', 'b', 'b1', 'b12', 'P', 'AND(b, b1)')
    for expected_name in present_before:
        assert feature_with_name(feature_defs, expected_name)

    # The reversed argument order must not exist.
    assert not feature_with_name(feature_defs, 'AND(b1, b)')

    present_after = (
        'a + P',
        'b12 + P',
        'a + b12',
        'OR(b, b1)',
        'OR(AND(b, b1), b)',
        'OR(AND(b, b1), b1)',
    )
    for expected_name in present_after:
        assert feature_with_name(feature_defs, expected_name)