def test_pickle_features(es):
    """Round-trip DFS features through save/load and compare on-disk sizes.

    Builds features on ``sessions`` with ``Last`` and ``Mean``, saves them
    with ``ft.save_features``, reloads them with ``ft.load_features`` and
    checks that each reloaded feature has the same hash and entityset as the
    feature it came from.  The saved feature file must also be smaller than
    the entityset both in memory (``asizeof``) and when pickled to disk.

    The two scratch files are written next to this test module and are
    removed in a ``finally`` block so a failing assertion does not leave
    them behind.
    """
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)

        # zip() stops at the shorter sequence, so a truncated reload would
        # otherwise pass unnoticed.
        assert len(features_no_pickle) == len(features_pickle)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Clean up whatever was written, even when an assertion failed.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def test_pickle_features_with_custom_primitive(es):
    """Round-trip features that include a ``make_agg_primitive`` primitive.

    Defines ``NewMean`` (wrapping ``np.nanmean``), builds features on
    ``sessions`` with it, and checks at least one built feature is a
    ``NewMean`` instance.  The features are then saved and reloaded; each
    reloaded feature must match its original by hash and entityset, and the
    saved file must be smaller than the entityset in memory and pickled.

    Scratch files are removed in a ``finally`` block so a failing assertion
    does not leave them next to the test module.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    try:
        # pickle entityset
        save_obj_pickle(es, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath)

        # zip() stops at the shorter sequence, so a truncated reload would
        # otherwise pass unnoticed.
        assert len(features_no_pickle) == len(features_pickle)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < asizeof(es)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Clean up whatever was written, even when an assertion failed.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
def test_allowed_paths(es):
    """Check that ``allowed_paths`` limits which entity paths DFS follows.

    Without the constraint both the sessions-based and the log-based ``Last``
    features on ``customers`` are built; with ``allowed_paths`` set to
    ``[['customers', 'sessions']]`` only the sessions-based one remains.
    """
    dfs_args = {
        'target_entity_id': 'customers',
        'entityset': es,
        'agg_primitives': [Last],
        'trans_primitives': [],
        'max_depth': 2,
        'seed_features': [],
    }
    all_paths_features = DeepFeatureSynthesis(**dfs_args).build_features()
    all_paths_names = [feat.get_name() for feat in all_paths_features]

    via_sessions = Last(es['sessions']['device_type'], es['customers'])
    via_log = Last(es['log']['value'], es['customers'])
    assert via_sessions.get_name() in all_paths_names
    assert via_log.get_name() in all_paths_names

    restricted = DeepFeatureSynthesis(
        allowed_paths=[['customers', 'sessions']], **dfs_args)
    restricted_names = [feat.get_name()
                        for feat in restricted.build_features()]
    assert via_sessions.get_name() in restricted_names
    assert via_log.get_name() not in restricted_names
def test_where_primitives(es):
    """Check that ``where_primitives`` selects which primitives get a where.

    With no ``where_primitives`` argument the where-features are ``Count``
    instances and never ``Last``; with ``where_primitives=['last']`` it is
    the other way round, and at least one where-feature depends on an
    ``Absolute`` feature.
    """
    # Work on a copy so the interesting value set below stays local.
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]
    dfs_args = {
        'target_entity_id': 'customers',
        'entityset': es,
        'agg_primitives': [Count, Last],
        'trans_primitives': [Absolute],
        'max_depth': 3,
    }
    default_dfs = DeepFeatureSynthesis(**dfs_args)
    last_only_dfs = DeepFeatureSynthesis(where_primitives=['last'], **dfs_args)
    default_features = default_dfs.build_features()
    last_only_features = last_only_dfs.build_features()

    default_where = [feat for feat in default_features
                     if feat.where is not None]
    last_only_where = [feat for feat in last_only_features
                       if feat.where is not None]

    assert default_where

    assert not any(isinstance(feat, Last) for feat in default_where)
    assert any(isinstance(feat, Count) for feat in default_where)

    assert any(isinstance(feat, Last) for feat in last_only_where)
    assert not any(isinstance(feat, Count) for feat in last_only_where)
    assert any(isinstance(dep, Absolute)
               for feat in last_only_where
               for dep in feat.get_deep_dependencies())
def test_makes_agg_features(es):
    """Expect ``LAST(log.value)`` among features built on ``sessions``."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
    )
    built = synthesizer.build_features()
    assert feature_with_name(built, 'LAST(log.value)')
def test_makes_trans_feat(es):
    """Expect ``HOUR(datetime)`` among features built on ``log``."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[Hour],
    )
    built = synthesizer.build_features()
    assert feature_with_name(built, 'HOUR(datetime)')
def test_handles_time_since_previous_entity_groupby(es):
    """Expect ``time_since_previous_by_session_id`` when DFS runs on ``log``
    with the ``TimeSincePrevious`` transform primitive."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[TimeSincePrevious],
    )
    built = synthesizer.build_features()
    assert feature_with_name(built, 'time_since_previous_by_session_id')
def test_makes_dfeatures_of_agg_primitives(es):
    """Expect ``customers.LAST(sessions.device_type)`` among the features
    built on ``sessions`` with the ``Last`` aggregation."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
    )
    built = synthesizer.build_features()
    expected_name = 'customers.LAST(sessions.device_type)'
    assert feature_with_name(built, expected_name)
def test_handles_cumsum_entity_groupby(es):
    """Expect a grouped ``CUM_MEAN`` direct feature when DFS runs on
    ``sessions`` with the ``CumMean`` transform primitive."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[CumMean],
    )
    built = synthesizer.build_features()
    expected_name = u'customers.CUM_MEAN(age by région_id)'
    assert feature_with_name(built, expected_name)
def test_makes_dfeatures(es):
    """Expect ``customers.age`` on ``sessions`` even with no primitives."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        filters=[],
        agg_primitives=[],
        trans_primitives=[],
    )
    built = synthesizer.build_features()
    assert feature_with_name(built, 'customers.age')
def test_makes_agg_features_of_trans_primitives(es):
    """Expect ``LAST(log.HOUR(datetime))``: an aggregation stacked on a
    transform feature."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        filters=[],
        agg_primitives=[Last],
        trans_primitives=[Hour],
    )
    built = synthesizer.build_features()
    assert feature_with_name(built, 'LAST(log.HOUR(datetime))')
def test_handles_diff_entity_groupby(es):
    """Expect ``DIFF`` of ``value`` grouped by both ``session_id`` and
    ``product_id`` when DFS runs on ``log``."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[Diff],
    )
    built = synthesizer.build_features()
    for expected_name in ('DIFF(value by session_id)',
                          'DIFF(value by product_id)'):
        assert feature_with_name(built, expected_name)
def test_case_insensitive(es):
    """Pass primitive names in mixed case (``'MiN'``, ``'AbsOlute'``) and
    expect the corresponding ``MIN`` / ``ABSOLUTE`` features."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=['MiN'],
        trans_primitives=['AbsOlute'],
    )
    built = synthesizer.build_features()
    for expected_name in ('MIN(log.value)',
                          'ABSOLUTE(MIN(log.value_many_nans))'):
        assert feature_with_name(built, expected_name)
def test_makes_agg_features_from_mixed_str(es):
    """Mix a primitive class (``Count``) with a primitive name (``'last'``)
    in ``agg_primitives`` and expect features from both."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count, 'last'],
        trans_primitives=[],
    )
    built = synthesizer.build_features()
    for expected_name in ('LAST(log.value)', 'COUNT(log)'):
        assert feature_with_name(built, expected_name)
def test_makes_agg_features_with_where(es):
    """After ``es.add_interesting_values()``, expect a ``COUNT`` feature with
    a ``WHERE priority_level = 0`` clause on ``sessions``."""
    es.add_interesting_values()

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesizer.build_features()
    expected_name = 'COUNT(log WHERE priority_level = 0)'
    assert feature_with_name(built, expected_name)
def test_abides_by_max_depth_param(es):
    """For ``max_depth`` of 1, 2 and 3, no built feature may be deeper than
    ``max_depth + 1``."""
    for depth_limit in (1, 2, 3):
        synthesizer = DeepFeatureSynthesis(
            target_entity_id='sessions',
            entityset=es,
            agg_primitives=[Last],
            trans_primitives=[],
            max_depth=depth_limit,
        )
        built = synthesizer.build_features()
        # The innermost identity feature is not counted against the limit,
        # hence the extra level of slack.
        assert all(feat.get_depth() <= depth_limit + 1 for feat in built)
def test_seed_features_added_with_identity_features(es):
    """Seed DFS on ``customers`` with ``COUNT(sessions)`` and check that the
    redundant ``LAST(sessions.customers.COUNT(sessions))`` is not built."""
    session_count_seed = Count(es['sessions']["id"], es['customers'])
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[session_count_seed],
    )
    built = synthesizer.build_features()
    # customers.COUNT(sessions) already lives on the customers entity, so
    # routing it through sessions and back would add nothing.
    redundant_name = 'LAST(sessions.customers.COUNT(sessions))'
    assert not feature_with_name(built, redundant_name)
def test_ignores_entities(es):
    """With ``ignore_entities=['log']`` no built feature may depend, at any
    depth, on a feature of the ``log`` entity."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        filters=[],
        agg_primitives=[Last],
        trans_primitives=[],
        ignore_entities=['log'],
    )
    for feat in synthesizer.build_features():
        dependency_entities = [dep.entity.id
                               for dep in feat.get_deep_dependencies()]
        assert 'log' not in dependency_entities
def test_stacking_where_primitives(es):
    """Check that ``where_stacking_limit`` controls nesting of where clauses.

    Both runs must produce where-features for ``Last`` and ``Count``.  With
    the default limit no where-feature has a base feature that itself has a
    where clause; with ``where_stacking_limit=2`` at least one does.
    """
    # Work on a copy so the interesting values set below stay local.
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]
    es['log']['product_id'].interesting_values = ["coke_zero"]
    dfs_args = {
        'target_entity_id': 'customers',
        'entityset': es,
        'agg_primitives': [Count, Last],
        'max_depth': 3,
    }
    default_limit_dfs = DeepFeatureSynthesis(
        where_primitives=['last', Count], **dfs_args)
    limit_two_dfs = DeepFeatureSynthesis(
        where_primitives=['last', Count], where_stacking_limit=2, **dfs_args)
    default_limit_features = default_limit_dfs.build_features()
    limit_two_features = limit_two_dfs.build_features()

    def with_where(feature_list):
        # Features that carry a where clause themselves.
        return [feat for feat in feature_list if feat.where is not None]

    def with_stacked_where(where_features):
        # One entry per base feature that also carries a where clause.
        return [feat
                for feat in where_features
                for base in feat.base_features
                if base.where is not None]

    default_limit_where = with_where(default_limit_features)
    limit_two_where = with_where(limit_two_features)

    assert default_limit_where
    assert limit_two_where

    for where_features in (default_limit_where, limit_two_where):
        assert any(isinstance(feat, Last) for feat in where_features)
        assert any(isinstance(feat, Count) for feat in where_features)

    assert not with_stacked_where(default_limit_where)
    assert with_stacked_where(limit_two_where)
def test_only_makes_supplied_trans_feat(es):
    """With ``Hour`` as the only transform primitive, no built feature (nor
    any of its direct base features) may be a different transform."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[],
        trans_primitives=[Hour],
    )
    built = synthesizer.build_features()

    def is_unrequested_transform(feat):
        return (isinstance(feat, TransformPrimitive) and
                not isinstance(feat, Hour))

    unexpected = [
        feat for feat in built
        if is_unrequested_transform(feat) or
        any(is_unrequested_transform(base) for base in feat.base_features)
    ]
    assert not unexpected
def test_ignores_variables(es):
    """With ``ignore_variables={'log': ['value']}`` no built feature may
    depend on the identity feature for ``log.value``."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        ignore_variables={'log': ['value']},
    )
    for feat in synthesizer.build_features():
        log_variable_ids = [
            dep.variable.id
            for dep in feat.get_deep_dependencies()
            if isinstance(dep, IdentityFeature) and dep.entity.id == 'log'
        ]
        assert 'value' not in log_variable_ids
def test_dfeats_where(es):
    """After ``es.add_interesting_values()``, check that no ``COUNT`` of
    ``sessions`` with a where clause is built on ``sessions`` itself."""
    es.add_interesting_values()

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesizer.build_features()

    unexpected_names = (
        'COUNT(sessions WHERE device_name = Mobile)',
        'COUNT(sessions WHERE device_name = PC)',
        'COUNT(sessions WHERE device_type = 0)',
    )
    for name in unexpected_names:
        assert not feature_with_name(built, name)
def test_seed_features(es):
    """Check that seed features are returned and are built upon.

    The sessions-level seed must appear in the output, and so must
    ``Last`` applied to the log-level ``Hour`` seed.
    """
    sessions_seed = Count(es['log']["id"], es['sessions']) > 2
    log_seed = Hour(es['log']['datetime'])
    last_of_log_seed = Last(log_seed, es['sessions'])

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last],
        trans_primitives=[],
        max_depth=2,
        seed_features=[sessions_seed, log_seed],
    )
    built_names = [feat.get_name() for feat in synthesizer.build_features()]
    assert sessions_seed.get_name() in built_names
    assert last_of_log_seed.get_name() in built_names
def test_makes_agg_features_with_where(es):
    """With ``Count`` as both aggregation and where primitive, expect where
    clauses on a plain variable and on a direct feature."""
    es.add_interesting_values()

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        where_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesizer.build_features()

    assert feature_with_name(built, 'COUNT(log WHERE priority_level = 0)')

    # the where clause may also be built on a direct feature
    assert feature_with_name(
        built, 'COUNT(log WHERE products.department = food)')
def test_dfeats_where(es):
    """After ``es.add_interesting_values()``, expect where-clause ``COUNT``
    features both directly and as a direct feature from ``customers``."""
    es.add_interesting_values()

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Count],
        trans_primitives=[],
    )
    built = synthesizer.build_features()

    # direct features of aggregation features that carry a where clause
    # must be built as well
    expected_names = (
        'customers.COUNT(log WHERE priority_level = 0)',
        'COUNT(log WHERE products.department = electronics)',
    )
    for name in expected_names:
        assert feature_with_name(built, name)
def test_where_different_base_feats(es):
    """No feature without a where clause may share its hash with a feature
    that has one."""
    # Work on a copy so the interesting value set below stays local.
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Last, Count],
        where_primitives=[Last, Count],
        max_depth=3,
    )
    built = synthesizer.build_features()

    where_hashes = [feat.hash() for feat in built if feat.where is not None]
    plain_hashes = [feat.hash() for feat in built if feat.where is None]
    assert all(hashed not in where_hashes for hashed in plain_hashes)
def test_commutative(es):
    """Count ``Add`` features built on ``log`` and how often an ``Add``
    appears as a direct base feature; expect 3 and 9 respectively."""
    synthesizer = DeepFeatureSynthesis(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[Sum],
        trans_primitives=[Add],
        max_depth=3,
    )
    built = synthesizer.build_features()

    add_feature_count = sum(1 for feat in built if isinstance(feat, Add))
    add_as_base_count = sum(1
                            for feat in built
                            for base in feat.base_features
                            if isinstance(base, Add))

    assert add_feature_count == 3
    assert add_as_base_count == 9
def test_drop_exact(es):
    """Passing a feature's full name in ``drop_exact`` removes it from the
    built features."""
    def build_with(drop_exact):
        # Fresh argument lists for every DFS instance.
        synthesizer = DeepFeatureSynthesis(
            target_entity_id='sessions',
            entityset=es,
            agg_primitives=[Last],
            trans_primitives=[],
            max_depth=1,
            seed_features=[],
            drop_exact=drop_exact,
        )
        return synthesizer.build_features()

    dropped_name = build_with([])[0].get_name()
    remaining_names = [feat.get_name() for feat in build_with([dropped_name])]
    assert dropped_name not in remaining_names
def test_drop_contains(es):
    """Passing the first five characters of a feature's name in
    ``drop_contains`` removes that feature from the built features."""
    def build_with(drop_contains):
        # Fresh argument lists for every DFS instance.
        synthesizer = DeepFeatureSynthesis(
            target_entity_id='sessions',
            entityset=es,
            agg_primitives=[Last],
            trans_primitives=[],
            max_depth=1,
            seed_features=[],
            drop_contains=drop_contains,
        )
        return synthesizer.build_features()

    dropped_feature = build_with([])[0]
    name_fragment = dropped_feature.get_name()[:5]
    remaining_names = [feat.get_name()
                       for feat in build_with([name_fragment])]
    assert dropped_feature.get_name() not in remaining_names
def test_ignores_entities(es):
    """``ignore_entities`` must be a list, and a listed entity must not show
    up among any built feature's deep dependencies."""
    def make_dfs(ignored):
        return DeepFeatureSynthesis(
            target_entity_id='sessions',
            entityset=es,
            agg_primitives=[Last],
            trans_primitives=[],
            ignore_entities=ignored,
        )

    # A bare string instead of a list is rejected at construction time.
    with pytest.raises(TypeError, match='ignore_entities must be a list'):
        make_dfs('log')

    for feat in make_dfs(['log']).build_features():
        dependency_entities = [dep.entity.id
                               for dep in feat.get_deep_dependencies()]
        assert 'log' not in dependency_entities
def test_where_primitives(es):
    """Check that ``where_primitives`` selects which primitives get a where.

    With no ``where_primitives`` argument the where-features are ``Count``
    instances and never ``Last``; with ``where_primitives=['last']`` it is
    the other way round, and at least one where-feature depends on an
    ``Absolute`` feature.
    """
    # Work on a copy so the interesting value set below stays local.
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]
    dfs_args = {
        'target_entity_id': 'customers',
        'entityset': es,
        'agg_primitives': [Count, Last],
        'trans_primitives': [Absolute],
        'max_depth': 3,
    }
    default_dfs = DeepFeatureSynthesis(**dfs_args)
    last_only_dfs = DeepFeatureSynthesis(where_primitives=['last'], **dfs_args)
    default_features = default_dfs.build_features()
    last_only_features = last_only_dfs.build_features()

    def with_where(feature_list):
        return [feat for feat in feature_list if feat.where is not None]

    default_where = with_where(default_features)
    last_only_where = with_where(last_only_features)

    assert default_where

    assert not any(isinstance(feat, Last) for feat in default_where)
    assert any(isinstance(feat, Count) for feat in default_where)

    assert any(isinstance(feat, Last) for feat in last_only_where)
    assert not any(isinstance(feat, Count) for feat in last_only_where)
    assert any(isinstance(dep, Absolute)
               for feat in last_only_where
               for dep in feat.get_deep_dependencies())
def test_allowed_paths(es):
    """Check that ``allowed_paths`` limits which entity paths DFS follows.

    Without the constraint both the sessions-based and the log-based ``Last``
    features on ``customers`` are built; with ``allowed_paths`` set to
    ``[['customers', 'sessions']]`` only the sessions-based one remains.
    """
    dfs_args = {
        'target_entity_id': 'customers',
        'entityset': es,
        'agg_primitives': [Last],
        'trans_primitives': [],
        'max_depth': 2,
        'seed_features': [],
    }
    all_paths_features = DeepFeatureSynthesis(**dfs_args).build_features()
    all_paths_names = [feat.get_name() for feat in all_paths_features]

    via_sessions = ft.Feature(es['sessions']['device_type'],
                              parent_entity=es['customers'],
                              primitive=Last)
    via_log = ft.Feature(es['log']['value'],
                         parent_entity=es['customers'],
                         primitive=Last)
    assert via_sessions.get_name() in all_paths_names
    assert via_log.get_name() in all_paths_names

    restricted = DeepFeatureSynthesis(
        allowed_paths=[['customers', 'sessions']], **dfs_args)
    restricted_names = [feat.get_name()
                        for feat in restricted.build_features()]
    assert via_sessions.get_name() in restricted_names
    assert via_log.get_name() not in restricted_names
def test_return_variable_types(es):
    """Check that ``return_variable_types`` filters the built features.

    Features are built four times from the same DFS object:

    * ``None``       -> discrete and numeric present, datetime absent
    * ``[Discrete]`` -> only discrete of the three
    * ``"all"``      -> discrete, numeric and datetime all present
    * ``[Datetime]`` -> only datetime of the three
    """
    dfs_obj = DeepFeatureSynthesis(target_entity_id="sessions",
                                   entityset=es,
                                   agg_primitives=[Count, NMostCommon],
                                   trans_primitives=[Absolute, Hour, IsIn])

    discrete = ft.variable_types.Discrete
    numeric = ft.variable_types.Numeric
    # Not called ``datetime`` so the stdlib module name is not shadowed.
    datetime_type = ft.variable_types.Datetime

    f1 = dfs_obj.build_features(return_variable_types=None)
    f2 = dfs_obj.build_features(return_variable_types=[discrete])
    f3 = dfs_obj.build_features(return_variable_types="all")
    f4 = dfs_obj.build_features(return_variable_types=[datetime_type])

    f1_types = {f.variable_type for f in f1}
    f2_types = {f.variable_type for f in f2}
    f3_types = {f.variable_type for f in f3}
    f4_types = {f.variable_type for f in f4}

    # Default selection. The datetime check previously looked at f2_types,
    # duplicating the check below and leaving the default result untested.
    assert discrete in f1_types
    assert numeric in f1_types
    assert datetime_type not in f1_types

    assert discrete in f2_types
    assert numeric not in f2_types
    assert datetime_type not in f2_types

    assert discrete in f3_types
    assert numeric in f3_types
    assert datetime_type in f3_types

    assert discrete not in f4_types
    assert numeric not in f4_types
    assert datetime_type in f4_types
# Example #34
# 0
def test_max_features(es):
    """``max_features=-1`` builds as many features as omitting the argument,
    and ``max_features=1`` builds exactly one."""
    dfs_args = {
        'target_entity_id': 'customers',
        'entityset': es,
        'agg_primitives': [Last],
        'trans_primitives': [],
        'max_depth': 2,
        'seed_features': [],
    }
    without_limit = DeepFeatureSynthesis(**dfs_args).build_features()
    explicit_no_limit = DeepFeatureSynthesis(
        max_features=-1, **dfs_args).build_features()
    limited_to_one = DeepFeatureSynthesis(
        max_features=1, **dfs_args).build_features()

    assert len(without_limit) == len(explicit_no_limit)
    assert len(limited_to_one) == 1
# Example #35
# 0
def test_only_makes_supplied_agg_feat(es):
    """With ``Last`` as the only aggregation primitive, no built feature (nor
    any of its direct base features) may be a different aggregation."""
    synthesizer = DeepFeatureSynthesis(
        agg_primitives=[Last],
        target_entity_id='customers',
        entityset=es,
        max_depth=3,
    )
    built = synthesizer.build_features()

    def is_unrequested_aggregation(feat):
        return (isinstance(feat, AggregationPrimitive) and
                not isinstance(feat, Last))

    unexpected = [
        feat for feat in built
        if is_unrequested_aggregation(feat) or
        any(is_unrequested_aggregation(base) for base in feat.base_features)
    ]
    assert not unexpected
# Example #36
# 0
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """With ``max_depth=1``, DFS builds one level on top of seed features
    but not two.

    The sessions-level seed and ``Last`` of the log-level ``Hour`` seed must
    be present; the direct feature of ``Mode`` over that ``Last`` must not.
    """
    sessions_seed = ft.Feature(es['log']["id"],
                               parent_entity=es['sessions'],
                               primitive=Count)
    log_seed = ft.Feature(es['log']['datetime'], primitive=Hour)
    one_level_up = ft.Feature(log_seed,
                              parent_entity=es['sessions'],
                              primitive=Last)

    # Two levels above the seed-based aggregation: deeper than max_depth
    # allows, so DFS should not produce it.
    mode_on_customers = ft.Feature(one_level_up,
                                   parent_entity=es['customers'],
                                   primitive=Mode)
    two_levels_up = DirectFeature(mode_on_customers, es['sessions'])

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[sessions_seed, log_seed],
    )
    built_names = [feat.get_name() for feat in synthesizer.build_features()]

    assert sessions_seed.get_name() in built_names
    assert one_level_up.get_name() in built_names
    assert two_levels_up.get_name() not in built_names
def test_where_different_base_feats(es):
    """No ``AggregationFeature`` without a where clause may share its hash
    with an ``AggregationFeature`` that has one."""
    # Work on a copy so the interesting value set below stays local.
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Last, Count],
        where_primitives=[Last, Count],
        max_depth=3,
    )
    built = synthesizer.build_features()

    where_hashes = []
    plain_hashes = []
    for feat in built:
        if not isinstance(feat, AggregationFeature):
            continue
        if feat.where is not None:
            where_hashes.append(feat.hash())
        else:
            plain_hashes.append(feat.hash())

    assert all(hashed not in where_hashes for hashed in plain_hashes)
def test_pickle_features_with_custom_primitive(es):
    """Round-trip features that include a ``make_agg_primitive`` primitive.

    Defines ``NewMean`` (wrapping ``np.nanmean``), builds features on
    ``sessions`` with it, and checks at least one built feature is a
    ``NewMean`` instance.  The features are saved, reloaded against ``es``,
    and compared pairwise by hash and entityset.  The saved file must be
    smaller than the features' entityset in memory (``getsize``) and when
    pickled.

    Scratch files are removed in a ``finally`` block so a failing assertion
    does not leave them next to the test module.
    """
    NewMean = make_agg_primitive(
        np.nanmean,
        name="NewMean",
        input_types=[Numeric],
        return_type=Numeric,
        description="Calculate means ignoring nan values")
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   filters=[],
                                   agg_primitives=[Last, Mean, NewMean],
                                   trans_primitives=[],
                                   max_features=20)

    features_no_pickle = dfs_obj.build_features()
    assert any(isinstance(feat, NewMean) for feat in features_no_pickle)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    filepath = os.path.join(dir_path, 'test_feature')
    es_filepath = os.path.join(dir_path, 'test_entityset')

    # The entityset that gets pickled and measured. Bound explicitly rather
    # than read from a loop variable after the loop has ended.
    feature_entityset = features_no_pickle[0].entityset

    try:
        # pickle entityset
        save_obj_pickle(feature_entityset, es_filepath)

        ft.save_features(features_no_pickle, filepath)
        features_pickle = ft.load_features(filepath, es)

        # zip() stops at the shorter sequence, so a truncated reload would
        # otherwise pass unnoticed.
        assert len(features_no_pickle) == len(features_pickle)
        for feat_1, feat_2 in zip(features_no_pickle, features_pickle):
            assert feat_1.hash() == feat_2.hash()
            assert feat_1.entityset == feat_2.entityset

        # file is smaller than entityset in memory
        assert os.path.getsize(filepath) < getsize(feature_entityset)

        # file is smaller than entityset pickled
        assert os.path.getsize(filepath) < os.path.getsize(es_filepath)
    finally:
        # Clean up whatever was written, even when an assertion failed.
        for path in (filepath, es_filepath):
            if os.path.exists(path):
                os.remove(path)
# Example #39
# 0
def test_allow_where(es):
    """With ``Count.allow_where`` set to ``False``, DFS builds where-features
    for ``Last`` but none for ``Count``.

    ``Count.allow_where`` is a class attribute, so it is switched off only
    while the features are built and restored in a ``finally`` block; a
    failure during building must not leak the change into other tests.
    """
    # Work on a copy so the interesting value set below stays local.
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]
    kwargs = dict(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Count, Last],
        max_depth=3,
    )

    previous_allow_where = Count.allow_where
    Count.allow_where = False
    try:
        dfs_constrained = DeepFeatureSynthesis(where_primitives=[Count, Last],
                                               **kwargs)
        features = dfs_constrained.build_features()
    finally:
        # Put back whatever was there before, not a hard-coded value.
        Count.allow_where = previous_allow_where

    where_feats = [f for f in features
                   if isinstance(f, AggregationFeature) and f.where is not None]

    assert any(isinstance(f.primitive, Last) for f in where_feats)
    assert not any(isinstance(f.primitive, Count) for f in where_feats)
def test_stacking_where_primitives(es):
    """Compare where-feature stacking with the default limit and a limit of 2.

    Both runs must produce where-aggregations for ``Last`` and ``Count``.
    Only the run built with ``where_stacking_limit=2`` may contain a where
    feature whose base feature is itself a where-aggregation.
    """
    es = copy.deepcopy(es)
    es['sessions']['device_type'].interesting_values = [0]
    es['log']['product_id'].interesting_values = ["coke_zero"]

    def is_where_agg(feature):
        # An aggregation feature that carries a where clause.
        return (isinstance(feature, AggregationFeature)
                and feature.where is not None)

    def uses_primitive(feats, primitive_cls):
        # Features from ``feats`` whose primitive is a ``primitive_cls``.
        return [f for f in feats if isinstance(f.primitive, primitive_cls)]

    def stacked(feats):
        # One entry per (feature, where-aggregation base feature) pair.
        return [feature
                for feature in feats
                for base_feat in feature.base_features
                if is_where_agg(base_feat)]

    common = dict(
        target_entity_id='customers',
        entityset=es,
        agg_primitives=[Count, Last],
        max_depth=3,
    )
    dfs_limit_1 = DeepFeatureSynthesis(
        where_primitives=['last', Count], **common)
    dfs_limit_2 = DeepFeatureSynthesis(
        where_primitives=['last', Count], where_stacking_limit=2, **common)
    limit_1_features = dfs_limit_1.build_features()
    limit_2_features = dfs_limit_2.build_features()

    where_feats_1 = [f for f in limit_1_features if is_where_agg(f)]
    where_feats_2 = [f for f in limit_2_features if is_where_agg(f)]

    assert len(where_feats_1) >= 1
    assert len(where_feats_2) >= 1

    assert len(uses_primitive(where_feats_1, Last)) > 0
    assert len(uses_primitive(where_feats_1, Count)) > 0

    assert len(uses_primitive(where_feats_2, Last)) > 0
    assert len(uses_primitive(where_feats_2, Count)) > 0

    assert len(stacked(where_feats_1)) == 0
    assert len(stacked(where_feats_2)) > 0
def test_max_hlevel(es):
    """Check which direct features of ``log`` appear for each ``max_hlevel``.

    With ``max_hlevel=-1`` both the session-level and the customer-level
    ``Last(value)`` features reach ``log``; with ``1`` only the session-level
    one does; with ``0`` neither does.
    """
    common = dict(
        target_entity_id='log',
        entityset=es,
        agg_primitives=[Count, Last],
        trans_primitives=[Hour],
        max_depth=-1,
    )

    # Construct all three synthesizers first, then build in the same order.
    synthesizers = [DeepFeatureSynthesis(max_hlevel=level, **common)
                    for level in (-1, 0, 1)]
    names_n1, names_0, names_1 = [
        [feature.get_name() for feature in dfs.build_features()]
        for dfs in synthesizers
    ]

    last_value_by_customer = ft.Feature(es['log']['value'],
                                        parent_entity=es['customers'],
                                        primitive=Last)
    last_value_by_session = ft.Feature(es['log']['value'],
                                       parent_entity=es['sessions'],
                                       primitive=Last)
    # Bring each aggregation back down to the log entity.
    via_customer = ft.Feature(
        ft.Feature(last_value_by_customer, es["sessions"]), es['log'])
    via_session = ft.Feature(last_value_by_session, es['log'])

    customer_name = via_customer.get_name()
    session_name = via_session.get_name()

    assert customer_name in names_n1
    assert session_name in names_n1

    assert customer_name not in names_1
    assert session_name in names_1

    assert customer_name not in names_0
    assert session_name not in names_0
示例#42
0
def test_primitive_options(es):
    """Check that per-primitive ``primitive_options`` restrict DFS inputs.

    The first half targets ``cohorts`` and checks include/ignore options on
    the primitives named ``sum``, ``mean``, ``mode`` and ``num_unique``.  The
    second half targets ``customers`` with no aggregation primitives and
    checks options on ``month``, ``day``, ``num_characters`` and ``year``.
    """
    options = {'sum': {'include_variables': {'customers': ['age']}},
               'mean': {'include_entities': ['customers']},
               'mode': {'ignore_entities': ['sessions']},
               'num_unique': {'ignore_variables': {'customers': ['engagement_level']}}}
    dfs_obj = DeepFeatureSynthesis(target_entity_id='cohorts',
                                   entityset=es,
                                   primitive_options=options)
    features = dfs_obj.build_features()
    for f in features:
        # Entities and identity-variable ids of the feature's dependencies.
        deps = f.get_dependencies()
        entities = [d.entity.id for d in deps]
        identities = [d for d in deps if isinstance(d, IdentityFeature)]
        variables = [d.variable.id for d in identities]
        if isinstance(f.primitive, Sum):
            if 'customers' in entities:
                assert 'age' in variables or variables == []
        if isinstance(f.primitive, Mean):
            assert 'customers' in entities
        if isinstance(f.primitive, Mode):
            assert 'sessions' not in entities
        if isinstance(f.primitive, NumUnique):
            if 'customers' in entities:
                assert 'engagement_level' not in variables

    options = {'month': {'ignore_variables': {'customers': ['date_of_birth']}},
               'day': {'include_variables': {'customers': ['signup_date', 'upgrade_date']}},
               'num_characters': {'ignore_entities': ['customers']},
               'year': {'include_entities': ['customers']}}
    # NOTE(review): the entity is spelled 'cohorts' elsewhere in this test;
    # 'cohort' here may be a typo -- confirm before changing.
    dfs_obj = DeepFeatureSynthesis(target_entity_id='customers',
                                   entityset=es,
                                   agg_primitives=[],
                                   ignore_entities=['cohort'],
                                   primitive_options=options)
    features = dfs_obj.build_features()
    # Features expose their primitive on ``.primitive``; checking the feature
    # object itself against a primitive class would never match.
    assert not any(isinstance(f.primitive, NumCharacters) for f in features)
    for f in features:
        deps = f.get_dependencies()
        entities = [d.entity.id for d in deps]
        identities = [d for d in deps if isinstance(d, IdentityFeature)]
        variables = [d.variable.id for d in identities]
        if isinstance(f.primitive, Month):
            if 'customers' in entities:
                assert 'date_of_birth' not in variables
        if isinstance(f.primitive, Day):
            if 'customers' in entities:
                assert 'signup_date' in variables or 'upgrade_date' in variables
        if isinstance(f.primitive, Year):
            assert 'customers' in entities
示例#43
0
def test_drop_exact(es):
    """A feature whose full name is listed in ``drop_exact`` is not built."""
    def build(drop_exact):
        # Same DFS configuration each time; only ``drop_exact`` varies.
        synthesizer = DeepFeatureSynthesis(target_entity_id='sessions',
                                           entityset=es,
                                           agg_primitives=[Last],
                                           trans_primitives=[],
                                           max_depth=1,
                                           seed_features=[],
                                           drop_exact=drop_exact)
        return synthesizer.build_features()

    baseline = build([])
    dropped_name = baseline[0].get_name()

    remaining_names = [feature.get_name() for feature in build([dropped_name])]
    assert dropped_name not in remaining_names
示例#44
0
def test_drop_contains(es):
    """A feature whose name contains a ``drop_contains`` entry is not built."""
    def build(drop_contains):
        # Same DFS configuration each time; only ``drop_contains`` varies.
        synthesizer = DeepFeatureSynthesis(target_entity_id='sessions',
                                           entityset=es,
                                           agg_primitives=[Last],
                                           trans_primitives=[],
                                           max_depth=1,
                                           seed_features=[],
                                           drop_contains=drop_contains)
        return synthesizer.build_features()

    baseline = build([])
    victim = baseline[0]
    # The first five characters of the name serve as the substring to drop.
    name_prefix = victim.get_name()[:5]

    remaining_names = [feature.get_name() for feature in build([name_prefix])]
    assert victim.get_name() not in remaining_names
示例#45
0
def test_primitive_options_errors(es):
    """Malformed ``primitive_options`` raise, unknown names warn exactly once.

    Each case builds a ``DeepFeatureSynthesis`` for ``customers`` with only
    the ``mode`` aggregation primitive and one faulty options dictionary.
    """
    def build_with(options):
        return DeepFeatureSynthesis(target_entity_id='customers',
                                    entityset=es,
                                    agg_primitives=['mode'],
                                    trans_primitives=[],
                                    primitive_options=options)

    # (expected exception, expected message, offending options)
    error_cases = [
        (KeyError,
         "Unrecognized primitive option \'ignore_entity\' for mode",
         {'mode': {'ignore_entity': ['sessions']}}),
        (TypeError,
         "Incorrect type formatting for \'ignore_entities\' for mode",
         {'mode': {'ignore_entities': 'sessions'}}),
        (TypeError,
         "Incorrect type formatting for \'ignore_variables\' for mode",
         {'mode': {'ignore_variables': {'sessions': 'product_id'}}}),
        (KeyError,
         "Multiple options found for primitive mode",
         {('count', 'mode'): {'ignore_entities': ['sessions']},
          'mode': {'include_entities': ['sessions']}}),
    ]
    for expected_error, message, bad_options in error_cases:
        with pytest.raises(expected_error, match=message):
            build_with(bad_options)

    invalid_entity_warning = "Entity \'invalid_entity\' not in entityset"
    invalid_variable_warning = "Variable \'invalid_variable\' not in entity \'sessions\'"
    # (expected warning message, offending options)
    warning_cases = [
        (invalid_entity_warning,
         {'mode': {'include_entities': ['invalid_entity']}}),
        (invalid_entity_warning,
         {'mode': {'include_variables': {'invalid_entity': ['product_id']}}}),
        (invalid_variable_warning,
         {'mode': {'include_variables': {'sessions': ['invalid_variable']}}}),
    ]
    for message, bad_options in warning_cases:
        with pytest.warns(UserWarning, match=message) as record:
            build_with(bad_options)
        assert len(record) == 1
示例#46
0
def test_primitive_options_with_globals(es):
    """Check ``primitive_options`` combined with global ignore settings.

    Three configurations targeting ``cohorts`` are exercised: a global
    ``ignore_entities`` next to a per-primitive one, a global
    ``ignore_variables`` next to a per-primitive one, and per-primitive
    include options that overlap the global ignores.
    """
    # non-overlapping ignore_entities
    options = {'mode': {'ignore_entities': ['sessions']}}
    dfs_obj = DeepFeatureSynthesis(target_entity_id='cohorts',
                                   entityset=es,
                                   ignore_entities=[u'régions'],
                                   primitive_options=options)
    features = dfs_obj.build_features()
    for f in features:
        deps = f.get_dependencies(deep=True)
        entities = [d.entity.id for d in deps]
        assert u'régions' not in entities
        if isinstance(f.primitive, Mode):
            assert 'sessions' not in entities

    # non-overlapping ignore_variables
    options = {'num_unique': {'ignore_variables': {'customers': ['engagement_level']}}}
    dfs_obj = DeepFeatureSynthesis(target_entity_id='cohorts',
                                   entityset=es,
                                   ignore_variables={'customers': [u'région_id']},
                                   primitive_options=options)
    features = dfs_obj.build_features()
    for f in features:
        deps = f.get_dependencies()
        entities = [d.entity.id for d in deps]
        identities = [d for d in deps if isinstance(d, IdentityFeature)]
        variables = [d.variable.id for d in identities]
        if 'customers' in entities:
            # Must match the accented name passed to ignore_variables above;
            # an unaccented spelling would make this check pass vacuously.
            assert u'région_id' not in variables
        if isinstance(f.primitive, NumUnique):
            if 'customers' in entities:
                assert 'engagement_level' not in variables

    # Overlapping globals/options with ignore_entities
    options = {'mode': {'include_entities': ['sessions', 'customers'],
                        'ignore_variables': {'customers': [u'région_id']}},
               'num_unique': {'include_entities': ['sessions', 'customers'],
                              'include_variables': {'sessions': ['device_type']}},
               'month': {'ignore_variables': {'cohorts': ['cohort_end']}}}
    dfs_obj = DeepFeatureSynthesis(target_entity_id='cohorts',
                                   entityset=es,
                                   ignore_entities=['sessions'],
                                   ignore_variables={'customers': ['age']},
                                   primitive_options=options)
    features = dfs_obj.build_features()
    for f in features:
        # 'month' is told to ignore cohorts' 'cohort_end', and no Month
        # feature is expected in the result.
        assert f.primitive.name != 'month'
        assert not isinstance(f.primitive, Month)

        deps = f.get_dependencies()
        entities = [d.entity.id for d in deps]
        identities = [d for d in deps if isinstance(d, IdentityFeature)]
        variables = [d.variable.id for d in identities]
        if isinstance(f.primitive, Mode):
            assert 'sessions' in entities or 'customers' in entities
            if 'customers' in entities:
                assert 'age' not in variables
                assert u'région_id' not in variables
        elif isinstance(f.primitive, NumUnique):
            assert 'sessions' in entities or 'customers' in entities
            if 'sessions' in entities:
                assert 'device_type' in variables or variables == []
        # All other primitives ignore 'sessions' and 'age'
        else:
            assert 'sessions' not in entities
            if 'customers' in entities:
                assert 'age' not in variables