def test_training_window(entityset):
    """Training window limits how far back data is pulled per cutoff time."""
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])

    # Include a direct feature to a higher-level agg so get_pandas_data_slice
    # sees multiple "filter eids" and the training_window pull loop runs
    # more than once.
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1)],
        'instance_id': [0, 1, 2],
    })

    # For now, this warns when the last_time_index is not present.
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    entityset.add_last_time_indexes()

    # An observation-based training window is rejected.
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix(
            [property_feature],
            entityset,
            cutoff_time=cutoff_time,
            training_window=Timedelta(2, 'observations', entity='log'))

    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')

    expected_counts = [5, 5, 1]
    expected_dagg = [3, 2, 1]
    assert (feature_matrix[property_feature.get_name()] == expected_counts).values.all()
    assert (feature_matrix[dagg.get_name()] == expected_dagg).values.all()
def test_approximate_dfeat_of_need_all_values(entityset):
    """An approximated dfeat built on a uses-full-entity feature (Percentile)
    matches values recomputed by hand at the approximate cutoff, while the
    non-approximated agg matches values recomputed at the true cutoff."""
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    times = [datetime(2011, 4, 9, 10, 31, 19), datetime(2011, 4, 9, 11, 0, 0)]
    cutoff_time = pd.DataFrame({'time': times, 'instance_id': [0, 2]})
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time_in_index=True,
                                              cutoff_time=cutoff_time)

    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    # 10:31:19 rounds down to 10:31:10 with a 10s approximation window;
    # 11:00:00 is already on a window boundary.
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # .copy() so adding the 'percentile' column does not raise pandas'
        # SettingWithCopyWarning on a boolean-mask slice of log_df.
        log_data_cutoff = log_df[log_df['datetime'] < cutoff].copy()
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx].copy()
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_approximate_dfeat_of_need_all_values(entityset):
    """Same check as the other test of this name but via the older
    instance_ids + list-of-datetimes cutoff API.

    NOTE(review): this redefines an earlier function of the same name, so
    only one of the two actually runs under pytest — confirm whether both
    variants are meant to be kept and rename one if so."""
    es = entityset
    p = Percentile(es['log']['value'])
    agg_feat = Sum(p, es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    feature_matrix = calculate_feature_matrix(
        [dfeat, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])

    log_df = es['log'].df
    instances = [0, 2]
    cutoffs = [pd.Timestamp('2011-04-09 10:31:19'),
               pd.Timestamp('2011-04-09 11:00:00')]
    approxes = [pd.Timestamp('2011-04-09 10:31:10'),
                pd.Timestamp('2011-04-09 11:00:00')]
    true_vals = []
    true_vals_approx = []
    for instance, cutoff, approx in zip(instances, cutoffs, approxes):
        # .copy() so adding the 'percentile' column does not raise pandas'
        # SettingWithCopyWarning on a boolean-mask slice of log_df.
        log_data_cutoff = log_df[log_df['datetime'] < cutoff].copy()
        log_data_cutoff['percentile'] = log_data_cutoff['value'].rank(pct=True)
        true_agg = log_data_cutoff.loc[log_data_cutoff['session_id'] == instance, 'percentile'].fillna(0).sum()
        true_vals.append(round(true_agg, 3))

        log_data_approx = log_df[log_df['datetime'] < approx].copy()
        log_data_approx['percentile'] = log_data_approx['value'].rank(pct=True)
        true_agg_approx = log_data_approx.loc[log_data_approx['session_id'].isin([0, 1, 2]), 'percentile'].fillna(0).sum()
        true_vals_approx.append(round(true_agg_approx, 3))

    lapprox = [round(x, 3) for x in feature_matrix[dfeat.get_name()].tolist()]
    test_list = [round(x, 3) for x in feature_matrix[agg_feat.get_name()].tolist()]
    assert lapprox == true_vals_approx
    assert test_list == true_vals
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    The graph looks like this:

          R       C = Customers, the entity we're trying to predict on
         / \\      R = Regions, a parent of customers
        S   C     S = Stores, a child of regions
            |
           etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_id_feat = IdentityFeature(entityset['stores']['id'])
    store_count_feat = Count(store_id_feat,
                             parent_entity=entityset['regions'])
    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_stores_feat.get_name()][0] == 3
def test_cfm_no_cutoff_time_index(entityset):
    """With cutoff_time_in_index=False the matrix keeps a plain 'id' index."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])

    # Cutoffs far in the future: every row's data is fully available.
    cutoff_time = pd.DataFrame({
        'time': [datetime(2013, 4, 9, 10, 31, 19),
                 datetime(2013, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              cutoff_time_in_index=False,
                                              approximate=Timedelta(12, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    # Cutoffs inside the data range: the approximated dfeat differs per row.
    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2]
    })
    feature_matrix_2 = calculate_feature_matrix([dfeat, agg_feat],
                                                entityset,
                                                cutoff_time_in_index=False,
                                                approximate=Timedelta(10, 's'),
                                                cutoff_time=cutoff_time)
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_arithmetic_of_direct(es):
    """Arithmetic primitives combine two direct features correctly."""
    rating = es['products']['rating']
    log_rating = DirectFeature(rating, child_entity=es['log'])
    customer_age = es['customers']['age']
    session_age = DirectFeature(customer_age, child_entity=es['sessions'])
    log_age = DirectFeature(session_age, child_entity=es['log'])

    # (primitive, expected values for instances [0, 3, 5, 7])
    cases = [(Add, [38, 37, 37.5, 37.5]),
             (Subtract, [28, 29, 28.5, 28.5]),
             (Multiply, [165, 132, 148.5, 148.5]),
             (Divide, [6.6, 8.25, 22. / 3, 22. / 3])]
    features = [prim(log_age, log_rating) for prim, _ in cases]

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 3, 5, 7],
                                               time_last=None)
    for feat, (_, expected) in zip(features, cases):
        assert df[feat.get_name()].values.tolist() == expected
def test_make_dfeat_of_agg_feat_through_parent(entityset, backend):
    """
    The graph looks like this:

          R       C = Customers, the entity we're trying to predict on
         / \\      R = Regions, a parent of customers
        S   C     S = Stores, a child of regions
            |
           etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on S.
    """
    store_id_feat = IdentityFeature(entityset['stores']['id'])
    store_count_feat = Count(store_id_feat,
                             parent_entity=entityset[u'régions'])
    num_stores_feat = DirectFeature(store_count_feat,
                                    child_entity=entityset['customers'])

    pandas_backend = backend([num_stores_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_stores_feat.get_name()][0] == 3
def test_cfm_no_cutoff_time_index(entityset):
    """Same check via the older instance_ids + list-of-datetimes cutoff API."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat4 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat4, es['sessions'])

    # Far-future cutoffs: all data visible for both rows.
    feature_matrix = calculate_feature_matrix(
        [dfeat, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=False,
        approximate=Timedelta(12, 's'),
        cutoff_time=[datetime(2013, 4, 9, 10, 31, 19),
                     datetime(2013, 4, 9, 11, 0, 0)])
    assert feature_matrix.index.name == 'id'
    assert feature_matrix.index.values.tolist() == [0, 2]
    assert feature_matrix[dfeat.get_name()].tolist() == [10, 10]
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]

    # Cutoffs inside the data range.
    feature_matrix_2 = calculate_feature_matrix(
        [dfeat, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=False,
        approximate=Timedelta(10, 's'),
        cutoff_time=[datetime(2011, 4, 9, 10, 31, 19),
                     datetime(2011, 4, 9, 11, 0, 0)])
    assert feature_matrix_2.index.name == 'id'
    assert feature_matrix_2.index.tolist() == [0, 2]
    assert feature_matrix_2[dfeat.get_name()].tolist() == [7, 10]
    assert feature_matrix_2[agg_feat.get_name()].tolist() == [5, 1]
def test_handles_training_window_correctly(entityset):
    """Exercises per-entity dict windows, string windows, and the rejection
    of observation-based windows."""
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset['regions'])

    cutoffs = [datetime(2011, 4, 9, 12, 31),
               datetime(2011, 4, 10, 11),
               datetime(2011, 4, 10, 13, 10, 1)]

    # Window keyed on the feature's target entity.
    feature_matrix = calculate_feature_matrix(
        [property_feature],
        instance_ids=[0, 1, 2],
        cutoff_time=list(cutoffs),
        training_window={'customers': '36 hours'})
    assert (feature_matrix[property_feature.get_name()] == [0, 5, 0]).values.all()

    # A direct feature to a higher-level agg gives multiple "filter eids"
    # in get_pandas_data_slice, so the training_window pull loop runs
    # more than once.
    dagg = DirectFeature(top_level_agg, entityset['customers'])
    feature_matrix = calculate_feature_matrix(
        [property_feature, dagg],
        instance_ids=[0, 1, 2],
        cutoff_time=list(cutoffs),
        training_window={'log': '2 hours'})
    assert (feature_matrix[property_feature.get_name()] == [5, 5, 1]).values.all()
    assert (feature_matrix[dagg.get_name()] == [3, 3, 3]).values.all()

    # Plain string window applied to everything.
    property_feature = Count(entityset['log']['id'], entityset['customers'])
    feature_matrix = calculate_feature_matrix(
        [property_feature],
        instance_ids=[0, 1, 2],
        cutoff_time=list(cutoffs),
        training_window='2 hours')
    assert (feature_matrix == [0, 0, 0]).values.all()

    # Observation-based windows are rejected.
    with pytest.raises(AssertionError):
        feature_matrix = calculate_feature_matrix(
            [property_feature],
            instance_ids=[0, 1, 2],
            cutoff_time=list(cutoffs),
            training_window=Timedelta(2, 'observations', entity='log'))
def test_direct_rename(es):
    """Renaming a direct feature changes its name/hash but not its base
    feature or entity (same behavior as test_direct_from_identity)."""
    feat = DirectFeature(base_feature=es['sessions']['device_type'],
                         child_entity=es['log'])
    renamed = feat.rename("session_test")
    assert feat.hash() != renamed.hash()
    assert feat.get_name() != renamed.get_name()
    assert feat.base_features[0].generate_name() == renamed.base_features[0].generate_name()
    assert feat.entity == renamed.entity
def test_direct_from_variable(es):
    """A direct feature of a variable behaves like one of an identity feature."""
    direct = DirectFeature(base_feature=es['sessions']['device_type'],
                           child_entity=es['log'])
    pandas_backend = PandasBackend(es, [direct])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 5],
                                               time_last=None)
    assert df[direct.get_name()].tolist() == [0, 1]
def test_make_dfeat(entityset, backend):
    """A direct feature pulls a parent (customer) value onto a child (session)."""
    age_on_session = DirectFeature(entityset['customers']['age'],
                                   child_entity=entityset['sessions'])
    pandas_backend = backend([age_on_session])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[age_on_session.get_name()][0] == 33
def test_direct_rename(es):
    """Renaming a direct feature changes its name/hash but not its base
    feature or entity (same behavior as test_direct_from_identity)."""
    feat = DirectFeature(base_feature=es['sessions']['device_type'],
                         child_entity=es['log'])
    renamed = feat.rename("session_test")
    assert feat.hash() != renamed.hash()
    assert feat.get_name() != renamed.get_name()
    assert feat.base_features[0]._get_name() == renamed.base_features[0]._get_name()
    assert feat.entity == renamed.entity
def test_approximate_dfeat_of_dfeat_of_agg_on_target(entityset):
    """Approximation applies through a dfeat-of-agg chain down to 'log'."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['log'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[dfeat.get_name()].tolist() == [7, 10]
def test_uses_full_entity_feat_of_approximate(entityset):
    """Only features NOT feeding a uses-full-entity primitive get approximated.

    dfeat feeds Percentile (needs all values) so it must stay exact; dfeat2
    does not, so it may be approximated.
    """
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    cutoffs = [datetime(2011, 4, 9, 10, 31, 19),
               datetime(2011, 4, 9, 11, 0, 0)]

    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    # dfeat2 is approximated the same whether computed alone or together.
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == feature_matrix_approx[dfeat2.get_name()].tolist()

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        entityset,
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    # The non-approximated features agree across all three runs.
    matrices = [feature_matrix_approx,
                feature_matrix_small_approx,
                feature_matrix_no_approx]
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations(matrices, 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_approximate_returns_correct_empty_default_values(entityset):
    """A cutoff before any data exists yields the agg's default (0), not NaN."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['customers'])
    dfeat = DirectFeature(agg_feat, es['sessions'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 11:00:00'),
                 pd.Timestamp('2011-04-09 11:00:00')],
        'instance_id': [0, 0],
    })
    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [0, 10]
def test_uses_full_entity_feat_of_approximate(entityset):
    """Same check as the other definition of this name, but without passing
    the entityset positionally to calculate_feature_matrix."""
    es = entityset
    agg_feat = Sum(es['log']['value'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    agg_feat3 = Min(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])
    dfeat2 = DirectFeature(agg_feat3, es['sessions'])
    p = Percentile(dfeat)

    cutoffs = [datetime(2011, 4, 9, 10, 31, 19),
               datetime(2011, 4, 9, 11, 0, 0)]

    # Only dfeat2 should be approximated, because Percentile needs all values.
    feature_matrix_only_dfeat2 = calculate_feature_matrix(
        [dfeat2],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == [1, 0]

    feature_matrix_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 's'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))
    assert feature_matrix_only_dfeat2[dfeat2.get_name()].tolist() == feature_matrix_approx[dfeat2.get_name()].tolist()

    feature_matrix_small_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        approximate=Timedelta(10, 'ms'),
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    feature_matrix_no_approx = calculate_feature_matrix(
        [p, dfeat, dfeat2, agg_feat],
        instance_ids=[0, 2],
        cutoff_time_in_index=True,
        cutoff_time=list(cutoffs))

    matrices = [feature_matrix_approx,
                feature_matrix_small_approx,
                feature_matrix_no_approx]
    for f in [p, dfeat, agg_feat]:
        for fm1, fm2 in combinations(matrices, 2):
            assert fm1[f.get_name()].tolist() == fm2[f.get_name()].tolist()
def test_approximate_multiple_instances_per_cutoff_time(entityset):
    """With a one-week approximation window both cutoffs land in the same
    window; the approximated dfeat ends up all-NaN while the exact agg
    stays correct."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              entityset,
                                              approximate=Timedelta(1, 'week'),
                                              cutoff_time=cutoff_time,
                                              chunk_size="cutoff time")
    assert feature_matrix.shape[0] == 2
    assert feature_matrix[dfeat.get_name()].dropna().shape[0] == 0
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(entityset, backend):
    """
    The graph looks like this (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Log
       \\ /     P = Products, a parent of Log which is not a descendent of customers
        L

    We're trying to calculate a DFeat from L to P on an agg_feat of P on L,
    and then aggregate it with another agg_feat of C on L.
    """
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['products'])
    product_purchases_feat = DirectFeature(log_count_feat,
                                           child_entity=entityset['log'])
    purchase_popularity = Mean(product_purchases_feat,
                               parent_entity=entityset['customers'])

    pandas_backend = backend([purchase_popularity])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[purchase_popularity.get_name()][0] == 38.0 / 10.0
def test_cutoff_time_naming(entityset):
    """Cutoff columns may be named (instance_id|id) and (time|cutoff_time);
    anything else raises."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 10:30:00'),
                 pd.Timestamp('2011-04-09 10:30:06')],
        'instance_id': [0, 0]
    })
    renamed_variants = [
        cutoff_df.rename(columns={"instance_id": "id"}),
        cutoff_df.rename(columns={"time": "cutoff_time"}),
        cutoff_df.rename(columns={"instance_id": "id",
                                  "time": "cutoff_time"}),
    ]
    cutoff_df_wrong_index_name = cutoff_df.rename(
        columns={"instance_id": "wrong_id"})

    fm1 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df)
    # All accepted namings produce the same matrix.
    for test_cutoff in renamed_variants:
        fm2 = calculate_feature_matrix([dfeat], cutoff_time=test_cutoff)
        assert all((fm1 == fm2.values).values)

    with pytest.raises(AttributeError):
        calculate_feature_matrix([dfeat],
                                 cutoff_time=cutoff_df_wrong_index_name)
def test_approximate_time_split_returns_the_same_result(entityset):
    """Calculating all cutoffs at once equals calculating them one at a time."""
    es = entityset
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-09 10:07:30'),
                 pd.Timestamp('2011-04-09 10:07:40')],
        'instance_id': [0, 0]
    })
    feature_matrix_at_once = calculate_feature_matrix([dfeat, agg_feat],
                                                      entityset,
                                                      approximate=Timedelta(10, 's'),
                                                      cutoff_time=cutoff_df)

    separate_cutoff = [cutoff_df.iloc[0:1], cutoff_df.iloc[1:]]
    # Make the two frames' indexes differ. Note that this step is
    # unnecessary and is done to showcase the issue here.
    separate_cutoff[0].index = [0]
    separate_cutoff[1].index = [1]

    divided_matrices = []
    for ct in separate_cutoff:
        fm = calculate_feature_matrix([dfeat, agg_feat],
                                      entityset,
                                      approximate=Timedelta(10, 's'),
                                      cutoff_time=ct)
        divided_matrices.append(fm)
    feature_matrix_from_split = pd.concat(divided_matrices)

    assert feature_matrix_from_split.shape == feature_matrix_at_once.shape
    for i1, i2 in zip(feature_matrix_at_once.index,
                      feature_matrix_from_split.index):
        assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
    for c in feature_matrix_from_split:
        for i1, i2 in zip(feature_matrix_at_once[c],
                          feature_matrix_from_split[c]):
            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_cutoff_time_extra_columns(entityset):
    """Extra cutoff columns (e.g. labels) are carried through, appended last,
    and sorted by time along with the rest of the matrix."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0],
                              'label': [True, True, False]},
                             columns=['time', 'instance_id', 'label'])

    fm = calculate_feature_matrix([dfeat], entityset, cutoff_time=cutoff_df)
    # The extra column is appended at the end of the matrix…
    assert 'label' == fm.columns[-1]
    # …and is sorted by time like the rest of the feature matrix.
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm['label'] == true_series).all()

    fm_2 = calculate_feature_matrix([dfeat],
                                    entityset,
                                    cutoff_time=cutoff_df,
                                    approximate="2 days")
    # Same carries through with approximation on.
    assert 'label' in fm_2.columns
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm_2['label'] == true_series).all()
def test_make_compare_feat(entityset, backend):
    """
    Feature we're creating is:
    Number of sessions for each customer where the
    number of logs in the session is less than 3
    """
    Count.max_stack_depth = 2
    log_count_feat = Count(entityset['log']['id'],
                           parent_entity=entityset['sessions'])
    mean_agg_feat = Mean(log_count_feat,
                         parent_entity=entityset['customers'])
    mean_feat = DirectFeature(mean_agg_feat,
                              child_entity=entityset['sessions'])
    feat = log_count_feat > mean_feat

    pandas_backend = backend([feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2],
                                               time_last=None)
    v0, v1, v2 = df[feat.get_name()][0:3]
    assert v0
    assert v1
    assert not v2
def test_diff(es):
    """Diff grouped by session_id vs. by a direct customer_id feature."""
    value = IdentityFeature(es['log']['value'])
    customer_id_feat = DirectFeature(es['sessions']['customer_id'],
                                     child_entity=es['log'])
    diff1 = Diff(value, es['log']['session_id'])
    diff2 = Diff(value, customer_id_feat)

    pandas_backend = PandasBackend(es, [diff1, diff2])
    df = pandas_backend.calculate_all_features(instance_ids=range(15),
                                               time_last=None)

    val1 = df[diff1.get_name()].values.tolist()
    val2 = df[diff2.get_name()].values.tolist()
    correct_vals1 = [np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1,
                     np.nan, np.nan, 5, np.nan, 7, 7]
    correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3,
                     np.nan, 5, -5, 7, 7]

    # Element-wise compare; NaN != NaN, so check NaN positions explicitly.
    for i in range(len(val1)):
        v1, v2 = val1[i], val2[i]
        if np.isnan(v1):
            assert np.isnan(correct_vals1[i])
        else:
            assert v1 == correct_vals1[i]
        if np.isnan(v2):
            assert np.isnan(correct_vals2[i])
        else:
            assert v2 == correct_vals2[i]
def test_empty_path_approximate_partial(entityset):
    """A session with a NaN customer_id yields NaN for the approximated
    dfeat but leaves the exact agg intact."""
    es = copy.deepcopy(entityset)
    # Break the path for session 2 by removing its customer link.
    es['sessions'].df['customer_id'] = [0, 0, np.nan, 1, 1, 2]
    agg_feat = Count(es['log']['id'], es['sessions'])
    agg_feat2 = Sum(agg_feat, es['customers'])
    dfeat = DirectFeature(agg_feat2, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([dfeat, agg_feat],
                                              es,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    vals1 = feature_matrix[dfeat.get_name()].tolist()
    assert vals1[0] == 7
    assert np.isnan(vals1[1])
    assert feature_matrix[agg_feat.get_name()].tolist() == [5, 1]
def test_approximate_child_aggs_handled_correctly(entityset):
    """Adding an exact child agg alongside an approximated dfeat does not
    change either feature's values."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['log']['value'], es['customers'])

    cutoff_df = pd.DataFrame({
        'time': [pd.Timestamp('2011-04-08 10:30:00'),
                 pd.Timestamp('2011-04-09 10:30:06')],
        'instance_id': [0, 0]
    })
    fm = calculate_feature_matrix([dfeat],
                                  approximate=Timedelta(10, 's'),
                                  cutoff_time=cutoff_df)
    fm_2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                    approximate=Timedelta(10, 's'),
                                    cutoff_time=cutoff_df)
    assert fm[dfeat.get_name()].tolist() == [2, 3]
    assert fm_2[agg_feat_2.get_name()].tolist() == [0, 2]
def test_approx_base_feature_is_also_first_class_feature(entityset):
    """A feature that is both requested directly and is a base of an
    approximated feature is still computed exactly."""
    es = entityset
    log_to_products = DirectFeature(es['products']['rating'], es['log'])
    # This one should still be computed properly.
    agg_feat = Min(log_to_products, es['sessions'])
    customer_agg_feat = Sum(agg_feat, es['customers'])
    # This one is to be approximated.
    sess_to_cust = DirectFeature(customer_agg_feat, es['sessions'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 10, 31, 19),
                 datetime(2011, 4, 9, 11, 0, 0)],
        'instance_id': [0, 2],
    })
    feature_matrix = calculate_feature_matrix([sess_to_cust, agg_feat],
                                              entityset,
                                              approximate=Timedelta(10, 's'),
                                              cutoff_time=cutoff_time)
    assert feature_matrix[sess_to_cust.get_name()].tolist() == [8.5, 7]
    assert feature_matrix[agg_feat.get_name()].tolist() == [4, 1.5]
def test_training_window_recent_time_index(entityset):
    """A customer with no sessions still gets sensible windowed values."""
    # Append a customer (id 3) with no sessions.
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cohort_name': ["Late Adopters"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    new_row = pd.DataFrame(row)
    new_row.index = range(3, 4)
    combined = entityset['customers'].df.append(new_row, sort=False)
    entityset['customers'].update_data(df=combined,
                                       recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1),
                 datetime(2011, 4, 10, 1, 59, 59)],
        'instance_id': [0, 1, 2, 3],
    })
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    feature_matrix.sort_index(inplace=True)

    expected_counts = [5, 5, 1, 0]
    expected_dagg = [3, 2, 1, 3]
    assert (feature_matrix[property_feature.get_name()] == expected_counts).values.all()
    assert (feature_matrix[dagg.get_name()] == expected_dagg).values.all()
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS keeps seed features and one level on top of them, but not
    features two levels beyond a seed."""
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # This feature is depth 2 relative to the seed (session_agg), which
    # exceeds max_depth=1, so DFS must not build it.
    session_agg_trans = DirectFeature(Count(session_agg, es['customers']),
                                      es['sessions'])

    dfs_obj = DeepFeatureSynthesis(
        target_entity_id='sessions',
        entityset=es,
        agg_primitives=[Last, Count],
        trans_primitives=[],
        max_depth=1,
        seed_features=[seed_feature_sessions, seed_feature_log])
    feature_names = [f.get_name() for f in dfs_obj.build_features()]

    assert seed_feature_sessions.get_name() in feature_names
    assert session_agg.get_name() in feature_names
    assert session_agg_trans.get_name() not in feature_names
def test_training_window_recent_time_index(entityset):
    """A customer with no sessions still gets sensible windowed values
    (variant with the 'cancel_reason' column in the appended row)."""
    # Append a customer (id 3) with no sessions.
    row = {
        'id': [3],
        'age': [73],
        u'région_id': ['United States'],
        'cohort': [1],
        'cancel_reason': ["I am finally awake!!"],
        'loves_ice_cream': [True],
        'favorite_quote': ["Who is John Galt?"],
        'signup_date': [datetime(2011, 4, 10)],
        'upgrade_date': [datetime(2011, 4, 12)],
        'cancel_date': [datetime(2011, 5, 13)],
        'date_of_birth': [datetime(1938, 2, 1)],
        'engagement_level': [2],
    }
    new_row = pd.DataFrame(row)
    new_row.index = range(3, 4)
    combined = entityset['customers'].df.append(new_row, sort=False)
    entityset['customers'].update_data(df=combined,
                                       recalculate_last_time_indexes=False)
    entityset.add_last_time_indexes()

    property_feature = Count(entityset['log']['id'], entityset['customers'])
    top_level_agg = Count(entityset['customers']['id'], entityset[u'régions'])
    dagg = DirectFeature(top_level_agg, entityset['customers'])

    cutoff_time = pd.DataFrame({
        'time': [datetime(2011, 4, 9, 12, 31),
                 datetime(2011, 4, 10, 11),
                 datetime(2011, 4, 10, 13, 10, 1),
                 datetime(2011, 4, 10, 1, 59, 59)],
        'instance_id': [0, 1, 2, 3],
    })
    feature_matrix = calculate_feature_matrix([property_feature, dagg],
                                              entityset,
                                              cutoff_time=cutoff_time,
                                              training_window='2 hours')
    feature_matrix.sort_index(inplace=True)

    expected_counts = [5, 5, 1, 0]
    expected_dagg = [3, 2, 1, 3]
    assert (feature_matrix[property_feature.get_name()] == expected_counts).values.all()
    assert (feature_matrix[dagg.get_name()] == expected_dagg).values.all()
def test_cfm_returns_original_time_indexes(entityset):
    """With cutoff_time_in_index=True the matrix index preserves the original
    (instance_id, time) pairs, sorted by time — with and without
    approximation, with and without unapproximated aggs."""
    es = entityset
    agg_feat = Count(es['customers']['id'], es[u'régions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])

    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0]})
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'],
                                      kind='mergesort')

    def _check_index(fm):
        # Level 0 is instance id, level 1 is the original cutoff time.
        instance_level_vals = fm.index.get_level_values(0).values
        time_level_vals = fm.index.get_level_values(1).values
        assert (instance_level_vals == sorted_df['instance_id'].values).all()
        assert (time_level_vals == sorted_df['time'].values).all()

    # No approximation.
    fm = calculate_feature_matrix([dfeat],
                                  entityset,
                                  cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    _check_index(fm)

    # Approximate, cutoffs fall in different windows, no unapproximated aggs.
    fm2 = calculate_feature_matrix([dfeat],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="1 m")
    _check_index(fm2)

    # Approximate, different windows, with unapproximated aggs.
    fm2 = calculate_feature_matrix([dfeat, agg_feat_2],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="1 m")
    _check_index(fm2)

    # Approximate, all cutoffs in the same window, no unapproximated aggs.
    fm3 = calculate_feature_matrix([dfeat],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="2 d")
    _check_index(fm3)

    # Approximate, same window, with unapproximated aggs.
    fm3 = calculate_feature_matrix([dfeat, agg_feat_2],
                                   entityset,
                                   cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True,
                                   approximate="2 d")
    _check_index(fm3)
def test_dfs_builds_on_seed_features_more_than_max_depth(es):
    """DFS keeps seed features and one level on top of them, but not
    features two levels beyond a seed."""
    seed_feature_sessions = Count(es['log']["id"], es['sessions']) > 2
    seed_feature_log = Hour(es['log']['datetime'])
    session_agg = Last(seed_feature_log, es['sessions'])

    # This feature is depth 2 relative to the seed (session_agg), which
    # exceeds max_depth=1, so DFS must not build it.
    session_agg_trans = DirectFeature(Count(session_agg, es['customers']),
                                      es['sessions'])

    dfs_obj = DeepFeatureSynthesis(target_entity_id='sessions',
                                   entityset=es,
                                   agg_primitives=[Last, Count],
                                   trans_primitives=[],
                                   max_depth=1,
                                   seed_features=[seed_feature_sessions,
                                                  seed_feature_log])
    feature_names = [f.get_name() for f in dfs_obj.build_features()]

    assert seed_feature_sessions.get_name() in feature_names
    assert session_agg.get_name() in feature_names
    assert session_agg_trans.get_name() not in feature_names
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = Count(entityset['customers']['id'],
                                parent_entity=entityset[u'régions'])
    num_customers_feat = DirectFeature(customer_count_feat,
                                       child_entity=entityset['customers'])

    pandas_backend = backend([num_customers_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_customers_feat.get_name()][0] == 3
def test_make_dfeat_of_agg_feat_on_self(entityset, backend):
    """
    The graph looks like this:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    We're trying to calculate a DFeat from C to R on an agg_feat of R on C.
    """
    customer_count_feat = Count(entityset['customers']['id'],
                                parent_entity=entityset['regions'])
    num_customers_feat = DirectFeature(customer_count_feat,
                                       child_entity=entityset['customers'])

    pandas_backend = backend([num_customers_feat])
    df = pandas_backend.calculate_all_features(instance_ids=[0],
                                               time_last=None)
    assert df[num_customers_feat.get_name()][0] == 3
def test_compare_of_direct(es):
    """Comparison primitives against a scalar work on direct features."""
    log_rating = DirectFeature(es['products']['rating'],
                               child_entity=es['log'])
    # (primitive, expected values for instances [0, 1, 2, 3] vs. 4.5)
    cases = [(Equals, [False, False, False, False]),
             (NotEquals, [True, True, True, True]),
             (LessThan, [False, False, False, True]),
             (LessThanEqualTo, [False, False, False, True]),
             (GreaterThan, [True, True, True, False]),
             (GreaterThanEqualTo, [True, True, True, False])]
    features = [prim(log_rating, 4.5) for prim, _ in cases]

    pandas_backend = PandasBackend(es, features)
    df = pandas_backend.calculate_all_features(instance_ids=[0, 1, 2, 3],
                                               time_last=None)
    for feat, (_, expected) in zip(features, cases):
        assert df[feat.get_name()].values.tolist() == expected