def test_compare_all_nans(es):
    """Check a Mode feature, and a comparison built on it, at an early cutoff.

    With ``time_last`` set to 1993 the Mode column is expected to contain
    no non-null values, and the ``== 'brown bag'`` comparison column is
    expected to contain no truthy values.
    """
    mode_of_product = Mode(es['log']['product_id'], es['sessions'])
    is_brown_bag = mode_of_product == 'brown bag'

    # before all data
    cutoff = pd.Timestamp('1/1/1993')

    calculator = PandasBackend(es, [mode_of_product, is_brown_bag])
    feature_matrix = calculator.calculate_all_features(
        instance_ids=[0, 1, 2],
        time_last=cutoff,
    )

    # Every Mode value should be null, so nothing survives dropna().
    non_null_modes = feature_matrix[mode_of_product.get_name()].dropna()
    assert non_null_modes.shape[0] == 0

    # Comparing against the all-null column must never evaluate truthy.
    assert not feature_matrix[is_brown_bag.get_name()].any()
def test_make_agg_feat_multiple_dtypes(entityset, backend):
    """Compute two aggregations sharing one ``where`` clause in a single run.

    A Count and a Mode over the log entity, both filtered to rows whose
    product_id equals 'coke zero', are calculated together for session 0.
    The Count is expected to be 3 and the Mode to be 'coke zero'.
    """
    log = entityset['log']
    sessions = entityset['sessions']

    is_coke_zero = IdentityFeature(log['product_id']) == 'coke zero'
    count_feat = Count(log['id'],
                       parent_entity=sessions,
                       where=is_coke_zero)
    mode_feat = Mode(log['product_id'],
                     parent_entity=sessions,
                     where=is_coke_zero)

    calculator = backend([count_feat, mode_feat])
    feature_matrix = calculator.calculate_all_features(instance_ids=[0],
                                                       time_last=None)

    count_column = feature_matrix[count_feat.get_name()]
    count_value = count_column[0]
    mode_column = feature_matrix[mode_feat.get_name()]
    mode_value = mode_column[0]

    assert count_value == 3
    assert mode_value == 'coke zero'
def test_return_type_inference_id(es):
    """Check the inferred variable type of features derived from Id variables."""
    log = es["log"]
    sessions = es["sessions"]

    # direct features should keep Id variable type
    customer_id_on_log = Feature(sessions["customer_id"], log)
    assert customer_id_on_log.variable_type == Id

    # aggregations of Id variable types should get converted
    session_id_mode = Mode(log["session_id"], es["customers"])
    assert session_id_mode.variable_type == Categorical

    # also test direct feature of aggregation
    session_id_mode_on_sessions = Feature(session_id_mode, sessions)
    assert session_id_mode_on_sessions.variable_type == Categorical
def test_limit_mode_uniques(es, session_id_feat, product_id_feat, datetime_feat):
    """Check LimitModeUniques with its default threshold and with 0.3.

    The Mode of ``product_id_feat`` per session is expected to pass the
    default filter and to be rejected once the threshold is lowered to 0.3.
    """
    sessions = es['sessions']
    mode_feat = Mode(product_id_feat, parent_entity=sessions)
    validity_args = {
        'feature': mode_feat,
        'entity': sessions,
        'target_entity_id': 'customers',
    }

    default_filter = filt.LimitModeUniques()
    assert default_filter.is_valid(**validity_args)

    # percent_unique is 6/15
    strict_filter = filt.LimitModeUniques(threshold=.3)
    assert not strict_filter.is_valid(**validity_args)
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """Check that DFS stacks on seed features but still honours ``max_depth``.

    With ``max_depth=1`` the seed feature and a one-step aggregation of a
    seed feature are expected in the output, while a feature two steps away
    from that aggregation is expected to be absent.
    """
    log = es['log']
    sessions = es['sessions']

    seed_feature_sessions = Count(log["id"], sessions) > 2
    seed_feature_log = Hour(log['datetime'])
    session_agg = Last(seed_feature_log, sessions)

    # Depth of this feat is 2 relative to session_agg, the seed feature,
    # which is greater than max_depth so it shouldn't be built
    customer_mode = Mode(session_agg, es['customers'])
    session_agg_trans = DirectFeature(customer_mode, sessions)

    synthesizer = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log],
    )
    built_features = synthesizer.build_features()

    # Collect the generated names once and reuse them for every membership check.
    built_names = [feature.get_name() for feature in built_features]

    assert seed_feature_sessions.get_name() in built_names
    assert session_agg.get_name() in built_names
    assert session_agg_trans.get_name() not in built_names
def test_return_type_inference_direct_feature(es):
    """Check that a direct feature of a Mode reports the base variable's class."""
    priority_mode = Mode(es["log"]["priority_level"], es["customers"])
    priority_mode_on_sessions = Feature(priority_mode, es["sessions"])

    expected_type = es["log"]["priority_level"].__class__
    assert priority_mode_on_sessions.variable_type == expected_type
def test_return_type_inference(es):
    """Check that a Mode aggregation reports the class of the variable it aggregates."""
    priority_mode = Mode(es["log"]["priority_level"], es["customers"])

    expected_type = es["log"]["priority_level"].__class__
    assert priority_mode.variable_type == expected_type