# Imports assumed for these tests (legacy featuretools 0.2-era API);
# exact module paths may differ between versions. The `entityset`, `es`,
# and `backend` arguments are pytest fixtures defined elsewhere in the suite.
import copy
from datetime import datetime

import numpy as np
import pandas as pd
import pytest
from numpy.testing import assert_array_equal

import featuretools as ft
from featuretools import Timedelta, calculate_feature_matrix
from featuretools.primitives import (Count, DirectFeature, Hour,
                                     IdentityFeature, Last, Mode, Sum, Trend,
                                     make_agg_primitive)
from featuretools.synthesis import DeepFeatureSynthesis
from featuretools.variable_types import Index, Numeric, Variable


def test_cfm_no_cutoff_time_index(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])
    cutoff_time = pd.DataFrame({
        'time': [datetime(2013, 4, 9, 10, 31, 19), datetime(2013, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=cutoff_time)
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]


def test_empty_child_dataframe():
    parent_df = pd.DataFrame({"id": [1]})
    child_df = pd.DataFrame({"id": [1, 2, 3],
                             "parent_id": [1, 1, 1],
                             "time_index": pd.date_range(start='1/1/2018', periods=3),
                             "value": [10, 5, 2]})

    es = ft.EntitySet(id="blah")
    es.entity_from_dataframe(entity_id="parent", dataframe=parent_df, index="id")
    es.entity_from_dataframe(entity_id="child", dataframe=child_df,
                             index="id", time_index="time_index")
    es.add_relationship(ft.Relationship(es["parent"]["id"], es["child"]["parent_id"]))

    # create regular agg
    count = Count(es["child"]['id'], es["parent"])

    # create agg feature that requires multiple arguments
    trend = Trend([es["child"]['value'], es["child"]['time_index']], es["parent"])

    # create aggs with where
    where = ft.Feature(es["child"]["value"]) == 1
    count_where = Count(es["child"]['id'], es["parent"], where=where)
    trend_where = Trend([es["child"]['value'], es["child"]['time_index']], es["parent"],
                        where=where)

    # cutoff time before all rows
    fm = ft.calculate_feature_matrix(entityset=es,
                                     features=[count, count_where, trend, trend_where],
                                     cutoff_time=pd.Timestamp("12/31/2017"))
    names = [count.get_name(), count_where.get_name(),
             trend.get_name(), trend_where.get_name()]
    assert_array_equal(fm[names], [[0, 0, np.nan, np.nan]])

    # cutoff time after all rows, but where clause filters all rows
    fm2 = ft.calculate_feature_matrix(entityset=es,
                                      features=[count_where, trend_where],
                                      cutoff_time=pd.Timestamp("1/4/2018"))
    names = [count_where.get_name(), trend_where.get_name()]
    assert_array_equal(fm2[names], [[0, np.nan]])


def test_cfm_no_cutoff_time_index(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              instance_ids=[0, 2],
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=[datetime(2013, 4, 9, 10, 31, 19),
                                                           datetime(2013, 4, 9, 11, 0, 0)])
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                instance_ids=[0, 2],
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                                                             datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]


def test_handles_training_window_correctly(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset['regions'])

    # make sure we test the target entity's training window
    feature_matrix = calculate_feature_matrix(
        [property_feature],
        instance_ids=[0, 1, 2],
        cutoff_time=[datetime(2011, 4, 9, 12, 31),
                     datetime(2011, 4, 10, 11),
                     datetime(2011, 4, 10, 13, 10, 1)],
        training_window={'customers': '36 hours'})
    prop_values = [0, 5, 0]
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()

    # make sure features that have a direct feature to a higher-level agg work,
    # so we have multiple "filter eids" in get_pandas_data_slice
    # and go through the loop to pull data with a training_window param more than once
    dagg = DirectFeature(top_level_agg, entityset['customers'])
    feature_matrix = calculate_feature_matrix(
        [property_feature, dagg],
        instance_ids=[0, 1, 2],
        cutoff_time=[datetime(2011, 4, 9, 12, 31),
                     datetime(2011, 4, 10, 11),
                     datetime(2011, 4, 10, 13, 10, 1)],
        training_window={'log': '2 hours'})
    prop_values = [5, 5, 1]
    dagg_values = [3, 3, 3]
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    feature_matrix = calculate_feature_matrix(
        [property_feature],
        instance_ids=[0, 1, 2],
        cutoff_time=[datetime(2011, 4, 9, 12, 31),
                     datetime(2011, 4, 10, 11),
                     datetime(2011, 4, 10, 13, 10, 1)],
        training_window='2 hours')
    labels = [0, 0, 0]
    assert (feature_matrix == labels).values.all()

    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix(
            [property_feature],
            instance_ids=[0, 1, 2],
            cutoff_time=[datetime(2011, 4, 9, 12, 31),
                         datetime(2011, 4, 10, 11),
                         datetime(2011, 4, 10, 13, 10, 1)],
            training_window=Timedelta(2, 'observations', entity='log'))


def test_training_window(entityset):
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])

    # make sure features that have a direct feature to a higher-level agg work,
    # so we have multiple "filter eids" in get_pandas_data_slice
    # and go through the loop to pull data with a training_window param more than once
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    # for now, warns if last_time_index not present
    times = [datetime(2011, 4, 9, 12, 31),
             datetime(2011, 4, 10, 11),
             datetime(2011, 4, 10, 13, 10, 1)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 1, 2]})
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    entityset.add_last_time_indexes()

    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix([property_feature],
                                                  entityset,
                                                  cutoff_time=cutoff_time,
                                                  training_window=Timedelta(2, 'observations', entity='log'))

    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    prop_values = [5, 5, 1]
    dagg_values = [3, 2, 1]
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()


def test_count_null_and_make_agg_primitive(es):
    def count_func(values, count_null=False):
        if len(values) == 0:
            return 0
        if count_null:
            values = values.fillna(0)
        return values.count()

    def count_generate_name(self):
        where_str = self._where_str()
        use_prev_str = self._use_prev_str()
        return u"COUNT(%s%s%s)" % (self.child_entity.id, where_str, use_prev_str)

    Count = make_agg_primitive(count_func,
                               [[Index], [Variable]],
                               Numeric,
                               name="count",
                               stack_on_self=False,
                               cls_attributes={"generate_name": count_generate_name})
    count_null = Count(es['log']['value'], es['sessions'], count_null=True)
    feature_matrix = ft.calculate_feature_matrix([count_null], entityset=es)
    values = [5, 4, 1, 2, 3, 2]
    assert (values == feature_matrix[count_null.get_name()]).all()


def test_make_agg_feat_using_prev_time(entityset, backend):
    agg_feat = Count(entityset['log']['id'],
                     parent_entity=entityset['sessions'],
                     use_previous=Timedelta(10, 's'))

    pandas_backend = backend([agg_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=datetime(2011, 4, 9, 10, 30, 10))
    v = df[agg_feat.get_name()][0]
    assert (v == 2)

    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=datetime(2011, 4, 9, 10, 30, 30))
    v = df[agg_feat.get_name()][0]
    assert (v == 1)


def test_make_agg_feat_of_identity_index_variable(entityset, backend):
    agg_feat = Count(entityset['log']['id'],
                     parent_entity=entityset['sessions'])

    pandas_backend = backend([agg_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[agg_feat.get_name()][0]
    assert (v == 5)


def test_make_agg_feat_of_grandchild_entity(entityset, backend):
    agg_feat = Count(entityset['log']['id'],
                     parent_entity=entityset['customers'])

    pandas_backend = backend([agg_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[agg_feat.get_name()][0]
    assert (v == 10)


def test_make_agg_feat_where_count(entityset, backend):
    agg_feat = Count(entityset['log']['id'],
                     parent_entity=entityset['sessions'],
                     where=IdentityFeature(entityset['log']['product_id']) == 'coke zero')

    pandas_backend = backend([agg_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[agg_feat.get_name()][0]
    assert (v == 3)


def test_approximate_dfeat_of_agg_on_target(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              instance_ids=[0, 2],
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                                                           datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]


def test_seed_features(es):
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last],
                                   trans_primitives=[],
                                   max_depth=2,
                                   seed_features=[seed_feature_sessions, seed_feature_log])
    features = dfs_obj.build_features()
    assert seed_feature_sessions.get_name() in [f.get_name() for f in features]
    assert session_agg.get_name() in [f.get_name() for f in features]


def test_approximate_multiple_instances_per_cutoff_time(entityset):
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(1, 'week'),
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time")
    assert feature_matrix.shape[0] == 2
    assert feature_matrix[dfeat.get_name()].dropna().shape[0] == 0
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]


def test_empty_path_approximate_partial(entityset):
    es = copy.deepcopy(entityset)
    es['sessions'].df['customer_id'] = [0, 0, np.nan, 1, 1, 2]
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              es,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[dfeat.get_name()].tolist()
    assert vals1[0] == 7
    assert np.isnan(vals1[1])
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]


def test_approximate_child_aggs_handled_correctly(entityset):
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['log']['value'], es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-08 10:30:00'),
                                       pd.Timestamp('2011-04-09 10:30:06')],
                              'instance_id': [0, 0]})

    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    fm_2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                    approximate=Timedelta(10, 's'),
                                    cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [2, 3]
    assert fm_2[agg_feat_2.get_name()].tolist() == [0, 2]


def test_training_window_recent_time_index(entityset):
    # customer with no sessions
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cohort_name': ["Late Adopters"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    df = pd.DataFrame(row)
    df.index = range(3, 4)
    df = entityset['customers'].df.append(df, sort=False)
    entityset['customers'].update_data(df=df, recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])
    instance_ids = [0, 1, 2, 3]
    times = [datetime(2011, 4, 9, 12, 31),
             datetime(2011, 4, 10, 11),
             datetime(2011, 4, 10, 13, 10, 1),
             datetime(2011, 4, 10, 1, 59, 59)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': instance_ids})
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    prop_values = [5, 5, 1, 0]
    dagg_values = [3, 2, 1, 3]
    feature_matrix.sort_index(inplace=True)
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()


def test_make_agg_feat_multiple_dtypes(entityset, backend):
    compare_prod = IdentityFeature(entityset['log']['product_id']) == 'coke zero'
    agg_feat = Count(entityset['log']['id'],
                     parent_entity=entityset['sessions'],
                     where=compare_prod)
    agg_feat2 = Mode(entityset['log']['product_id'],
                     parent_entity=entityset['sessions'],
                     where=compare_prod)

    pandas_backend = backend([agg_feat, agg_feat2])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    v = df[agg_feat.get_name()][0]
    v2 = df[agg_feat2.get_name()][0]
    assert (v == 3)
    assert (v2 == 'coke zero')


def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # Depth of this feat is 2 relative to session_agg, the seed feature,
    # which is greater than max_depth so it shouldn't be built
    session_agg_trans = DirectFeature(Count(session_agg, es['customers']),
                                      es['sessions'])
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Count],
                                   trans_primitives=[],
                                   max_depth=1,
                                   seed_features=[seed_feature_sessions, seed_feature_log])
    features = dfs_obj.build_features()
    assert seed_feature_sessions.get_name() in [f.get_name() for f in features]
    assert session_agg.get_name() in [f.get_name() for f in features]
    assert session_agg_trans.get_name() not in [f.get_name() for f in features]


def test_training_window_recent_time_index(entityset):
    # customer with no sessions
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cancel_reason': ["I am finally awake!!"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    df = pd.DataFrame(row)
    df.index = range(3, 4)
    df = entityset['customers'].df.append(df, sort=False)
    entityset['customers'].update_data(df=df, recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])
    instance_ids = [0, 1, 2, 3]
    times = [datetime(2011, 4, 9, 12, 31),
             datetime(2011, 4, 10, 11),
             datetime(2011, 4, 10, 13, 10, 1),
             datetime(2011, 4, 10, 1, 59, 59)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': instance_ids})
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    prop_values = [5, 5, 1, 0]
    dagg_values = [3, 2, 1, 3]
    feature_matrix.sort_index(inplace=True)
    assert (feature_matrix[property_feature.get_name()] == prop_values).values.all()
    assert (feature_matrix[dagg.get_name()] == dagg_values).values.all()


def test_make_agg_feat_where_count_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    """
    Count.max_stack_depth = 2
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'])
    feat = Count(entityset['sessions']['id'],
                 parent_entity=entityset['customers'],
                 where=log_count_feat > 1)

    pandas_backend = backend([feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1],
                                               time_last=None)
    name = feat.get_name()
    instances = df[name]
    v0, v1 = instances[0:2]
    assert (v0 == 2)
    assert (v1 == 2)


def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # Depth of this feat is 2 relative to session_agg, the seed feature,
    # which is greater than max_depth so it shouldn't be built
    session_agg_trans = DirectFeature(Mode(session_agg, es['customers']),
                                      es['sessions'])
    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Count],
                                   trans_primitives=[],
                                   max_depth=1,
                                   seed_features=[seed_feature_sessions, seed_feature_log])
    features = dfs_obj.build_features()
    assert seed_feature_sessions.get_name() in [f.get_name() for f in features]
    assert session_agg.get_name() in [f.get_name() for f in features]
    assert session_agg_trans.get_name() not in [f.get_name() for f in features]


def test_make_agg_feat_where_count_or_device_type_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is greater than 1
    or the device type is 1
    """
    Count.max_stack_depth = 2
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'])
    compare_count = log_count_feat > 1
    compare_device_type = IdentityFeature(entityset['sessions']['device_type']) == 1
    or_feat = compare_count.OR(compare_device_type)
    feat = Count(entityset['sessions']['id'],
                 parent_entity=entityset['customers'],
                 where=or_feat)

    pandas_backend = backend([feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    name = feat.get_name()
    instances = df[name]
    assert (instances[0] == 3)