def test_topn(es):
    """Check the NMostCommon(n=2) aggregation of ``log.product_id`` onto customers.

    The feature is calculated for customers 0, 1 and 2 and compared row by
    row against a hand-written table of expected values.
    """
    topn = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['customers'],
        primitive=NMostCommon(n=2),
    )
    feature_set = FeatureSet([topn])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array([0, 1, 2]))
    true_results = pd.DataFrame([
        ['toothpaste', 'coke zero'],
        ['coke zero', 'Haribo sugar-free gummy bears'],
        ['taco clock', np.nan],
    ])

    # all() is required: asserting the bare list would succeed for any
    # non-empty list, even one consisting solely of False values.
    assert all(name in df.columns for name in topn.get_feature_names())

    for i in range(df.shape[0]):
        true = true_results.loc[i]
        actual = df.loc[i]
        if i == 0:
            # coke zero and toothpaste have same number of occurrences,
            # so their relative order is not checked
            assert set(true.values) == set(actual.values)
        else:
            for i1, i2 in zip(true, actual):
                # NaN never compares equal to itself, so handle it explicitly
                assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)
def test_two_relationships_to_single_entity(games_es):
    """Aggregate and look up features along two relationships between the same entities.

    ``games`` is linked to ``teams`` twice (home team and away team).  A Mean
    aggregation is built along each relationship and then pulled back onto
    ``games`` through the matching relationship.
    """
    home_rel, away_rel = games_es.relationships

    # Mean home score per team, following only the home-team relationship.
    mean_at_home = ft.AggregationFeature(
        games_es['games']['home_team_score'],
        games_es['teams'],
        relationship_path=RelationshipPath([(False, home_rel)]),
        primitive=ft.primitives.Mean,
    )
    # Mean away score per team, following only the away-team relationship.
    mean_at_away = ft.AggregationFeature(
        games_es['games']['away_team_score'],
        games_es['teams'],
        relationship_path=RelationshipPath([(False, away_rel)]),
        primitive=ft.primitives.Mean,
    )

    home_team_mean = ft.DirectFeature(mean_at_home, games_es['games'], relationship=home_rel)
    away_team_mean = ft.DirectFeature(mean_at_away, games_es['games'], relationship=away_rel)

    calculator = FeatureSetCalculator(
        games_es,
        time_last=datetime(2011, 8, 28),
        feature_set=FeatureSet([home_team_mean, away_team_mean]),
    )
    df = calculator.run(np.array(range(3)))

    home_matches = df[home_team_mean.get_name()] == [1.5, 1.5, 2.5]
    away_matches = df[away_team_mean.get_name()] == [1, 0.5, 2]
    assert home_matches.all()
    assert away_matches.all()
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Entity graph under test:

        R       C = Customers, the entity we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto regions and then exposed on
    customers as a direct feature (DFeat from C to R on an agg_feat of R on S).
    """
    store_ids = IdentityFeature(es['stores']['id'])
    stores_per_region = ft.Feature(store_ids, parent_entity=es[u'régions'], primitive=Count)
    num_stores_feat = DirectFeature(stores_per_region, child_entity=es['customers'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_stores_feat]),
    )
    result = calculator.run(np.array([0]))

    column = result[num_stores_feat.get_name()]
    assert column[0] == 3
def test_make_deep_agg_feat_of_dfeat_of_agg_feat(es):
    """
    Entity graph under test (higher implies parent):

          C     C = Customers, the entity we're trying to predict on
          |     S = Sessions, a child of Customers
      P   S     L = Log, a child of both Sessions and Products
       \\ /     P = Products, a parent of Log which is not a descendant of customers
        L

    A Count of log rows is aggregated onto products, brought back down to
    log as a direct feature, and finally averaged up to customers.
    """
    logs_per_product = ft.Feature(es['log']['id'], parent_entity=es['products'], primitive=Count)
    product_purchases_feat = DirectFeature(logs_per_product, child_entity=es['log'])
    purchase_popularity = ft.Feature(
        product_purchases_feat,
        parent_entity=es['customers'],
        primitive=Mean,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([purchase_popularity]),
    )
    result = calculator.run(np.array([0]))

    column = result[purchase_popularity.get_name()]
    assert column[0] == 38.0 / 10.0
def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1 OR the session's device_type equals 1
    """
    logs_per_session = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)
    has_multiple_logs = logs_per_session > 1
    is_device_one = IdentityFeature(es['sessions']['device_type']) == 1
    where_clause = has_multiple_logs.OR(is_device_one)

    feat = ft.Feature(
        es['sessions']['id'],
        parent_entity=es['customers'],
        where=where_clause,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(es, time_last=None, feature_set=FeatureSet([feat]))
    result = calculator.run(np.array([0]))

    instances = result[feat.get_name()]
    assert instances[0] == 3
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Entity graph under test:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the entity we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto regions and then exposed back on
    customers as a direct feature (DFeat from C to R on an agg_feat of R on C).
    """
    customers_per_region = ft.Feature(
        es['customers']['id'],
        parent_entity=es[u'régions'],
        primitive=Count,
    )
    num_customers_feat = DirectFeature(customers_per_region, child_entity=es['customers'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_customers_feat]),
    )
    result = calculator.run(np.array([0]))

    column = result[num_customers_feat.get_name()]
    assert column[0] == 3
def test_make_compare_feat(es):
    """
    Feature we're creating is:
    Whether the number of logs in a session is greater than the mean number
    of logs per session of that session's customer
    """
    logs_per_session = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)
    customer_mean = ft.Feature(logs_per_session, parent_entity=es['customers'], primitive=Mean)
    # Bring the customer-level mean back down to each session for comparison.
    session_mean = DirectFeature(customer_mean, child_entity=es['sessions'])
    feat = logs_per_session > session_mean

    calculator = FeatureSetCalculator(es, time_last=None, feature_set=FeatureSet([feat]))
    result = calculator.run(np.array([0, 1, 2]))

    first, second, third = result[feat.get_name()][0:3]
    assert first
    assert second
    assert not third
def test_make_agg_feat_where_count_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1
    """
    logs_per_session = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)
    feat = ft.Feature(
        es['sessions']['id'],
        parent_entity=es['customers'],
        where=logs_per_session > 1,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(es, time_last=None, feature_set=FeatureSet([feat]))
    result = calculator.run(np.array([0, 1]))

    first, second = result[feat.get_name()][0:2]
    assert first == 2
    assert second == 2
def calc_results(time_last, ids, precalculated_features=None, training_window=None):
    """Run a FeatureSetCalculator for ``ids`` at ``time_last`` and return its result.

    Relies on names from the enclosing scope: ``entityset``, ``feature_set``,
    ``progress_bar``, ``progress_callback``, ``group`` and
    ``update_progress_callback_parameters``.

    When a progress bar is present, the calculator is handed a hook that
    advances the bar by ``done * group.shape[0]`` and, if a user progress
    callback was supplied, forwards the derived progress values to it.
    """
    if progress_bar is None:
        on_progress = None
    else:
        def on_progress(done):
            # Capture the bar position before updating so the helper can
            # work out how far this call moved it.
            position_before = progress_bar.n
            progress_bar.update(done * group.shape[0])
            if progress_callback is None:
                return
            update, progress_percent, time_elapsed = update_progress_callback_parameters(
                progress_bar, position_before)
            progress_callback(update, progress_percent, time_elapsed)

    calculator = FeatureSetCalculator(
        entityset,
        feature_set,
        time_last,
        training_window=training_window,
        precalculated_features=precalculated_features,
    )
    return calculator.run(ids, progress_callback=on_progress)
def test_diff(pd_es):
    """Check the Diff primitive on ``log.value`` with two different groupings.

    ``diff1`` groups by the log's own ``session_id``; ``diff2`` groups by the
    ``customer_id`` pulled down from ``sessions``.  Both are calculated for
    log instances 0 through 14 and compared to hand-written expectations,
    treating NaN as equal to NaN.
    """
    value = ft.Feature(pd_es['log'].ww['value'])
    customer_id_feat = ft.Feature(pd_es['sessions'].ww['customer_id'], 'log')
    diff1 = ft.Feature(
        value,
        groupby=ft.Feature(pd_es['log'].ww['session_id']),
        primitive=Diff,
    )
    diff2 = ft.Feature(value, groupby=customer_id_feat, primitive=Diff)
    feature_set = FeatureSet([diff1, diff2])
    calculator = FeatureSetCalculator(pd_es, feature_set=feature_set)
    df = calculator.run(np.array(range(15)))

    val1 = df[diff1.get_name()].tolist()
    val2 = df[diff2.get_name()].tolist()
    correct_vals1 = [
        np.nan, 5, 5, 5, 5, np.nan, 1, 1, 1, np.nan, np.nan, 5, np.nan, 7, 7
    ]
    correct_vals2 = [np.nan, 5, 5, 5, 5, -20, 1, 1, 1, -3, np.nan, 5, -5, 7, 7]

    for actual, expected in ((val1, correct_vals1), (val2, correct_vals2)):
        # Guard against a truncated result silently passing the zip below.
        assert len(actual) == len(expected)
        for got, want in zip(actual, expected):
            if np.isnan(got):
                # NaN != NaN, so a missing value must be matched explicitly
                assert np.isnan(want)
            else:
                assert got == want
def test_make_compare_feat(es):
    """
    Feature we're creating is:
    Whether the number of logs in a session is greater than the mean number
    of logs per session of that session's customer
    """
    logs_per_session = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        primitive=Count,
    )
    customer_mean = ft.Feature(
        logs_per_session,
        parent_dataframe_name="customers",
        primitive=Mean,
    )
    # Bring the customer-level mean back down to each session for comparison.
    session_mean = DirectFeature(customer_mean, child_dataframe_name="sessions")
    feat = logs_per_session > session_mean

    calculator = FeatureSetCalculator(es, time_last=None, feature_set=FeatureSet([feat]))
    result = to_pandas(
        calculator.run(np.array([0, 1, 2])),
        index="id",
        sort_index=True,
    )

    first, second, third = result[feat.get_name()][0:3]
    assert first
    assert second
    assert not third
def test_make_agg_feat_multiple_dtypes(es):
    """Aggregate two differently-typed log columns onto sessions under one where clause.

    Count is applied to ``log.id`` and Mode to ``log.product_id``, both
    restricted to rows whose product is 'coke zero'.  The test is marked as
    an expected failure when any entity is backed by a dask DataFrame.
    """
    uses_dask = any(isinstance(entity.df, dd.DataFrame) for entity in es.entities)
    if uses_dask:
        pytest.xfail(
            'Currently no dask compatible agg prims that use multiple dtypes')

    is_coke_zero = IdentityFeature(es['log']['product_id']) == 'coke zero'
    count_feat = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Count,
    )
    mode_feat = ft.Feature(
        es['log']['product_id'],
        parent_entity=es['sessions'],
        where=is_coke_zero,
        primitive=Mode,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feat, mode_feat]),
    )
    result = calculator.run(np.array([0]))
    if isinstance(result, dd.DataFrame):
        result = result.compute()

    count_value = result[count_feat.get_name()][0]
    mode_value = result[mode_feat.get_name()][0]
    assert count_value == 3
    assert mode_value == 'coke zero'
def test_make_agg_feat_multiple_dtypes(es):
    """Aggregate two differently-typed log columns onto sessions under one where clause.

    Count is applied to ``log.id`` and Mode to ``log.product_id``, both
    restricted to rows whose product is "coke zero".  The test is marked as
    an expected failure unless the EntitySet's dataframe type is pandas.
    """
    is_pandas = es.dataframe_type == Library.PANDAS.value
    if not is_pandas:
        pytest.xfail(
            "Currently no Dask or Spark compatible agg prims that use multiple dtypes"
        )

    is_coke_zero = IdentityFeature(es["log"].ww["product_id"]) == "coke zero"
    count_feat = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        where=is_coke_zero,
        primitive=Count,
    )
    mode_feat = ft.Feature(
        es["log"].ww["product_id"],
        parent_dataframe_name="sessions",
        where=is_coke_zero,
        primitive=Mode,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([count_feat, mode_feat]),
    )
    result = calculator.run(np.array([0]))

    count_value = result[count_feat.get_name()][0]
    mode_value = result[mode_feat.get_name()][0]
    assert count_value == 3
    assert mode_value == "coke zero"
def test_make_dfeat_of_agg_feat_on_self(es):
    """
    Dataframe graph under test:

        R       R = Regions, a parent of customers
        |
        C       C = Customers, the dataframe we're trying to predict on
        |
       etc.

    A Count of customers is aggregated onto regions and then exposed back on
    customers as a direct feature (DFeat from C to R on an agg_feat of R on C).
    """
    customers_per_region = ft.Feature(
        es["customers"].ww["id"],
        parent_dataframe_name="régions",
        primitive=Count,
    )
    num_customers_feat = DirectFeature(
        customers_per_region,
        child_dataframe_name="customers",
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_customers_feat]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    column_values = result[num_customers_feat.get_name()].values
    assert column_values[0] == 3
def test_make_dfeat_of_agg_feat_through_parent(es):
    """
    Dataframe graph under test:

        R       C = Customers, the dataframe we're trying to predict on
       / \\     R = Regions, a parent of customers
      S   C     S = Stores, a child of regions
          |
         etc.

    A Count of stores is aggregated onto regions and then exposed on
    customers as a direct feature (DFeat from C to R on an agg_feat of R on S).
    """
    store_ids = IdentityFeature(es["stores"].ww["id"])
    stores_per_region = ft.Feature(
        store_ids,
        parent_dataframe_name="régions",
        primitive=Count,
    )
    num_stores_feat = DirectFeature(stores_per_region, child_dataframe_name="customers")

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([num_stores_feat]),
    )
    result = to_pandas(calculator.run(np.array([0])), index="id")

    column_values = result[num_stores_feat.get_name()].values
    assert column_values[0] == 3
def test_make_agg_feat_where_count_or_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1 OR the session's device_type equals 1
    """
    logs_per_session = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        primitive=Count,
    )
    has_multiple_logs = logs_per_session > 1
    is_device_one = IdentityFeature(es["sessions"].ww["device_type"]) == 1
    where_clause = has_multiple_logs.OR(is_device_one)

    feat = ft.Feature(
        es["sessions"].ww["id"],
        parent_dataframe_name="customers",
        where=where_clause,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(es, time_last=None, feature_set=FeatureSet([feat]))
    result = to_pandas(calculator.run(np.array([0])), index="id", int_index=True)

    instances = result[feat.get_name()]
    assert instances.values[0] == 3
def test_make_agg_feat_where_count_and_device_type_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session equals 1 AND the session's device_type equals 1
    """
    log_count_feat = ft.Feature(es['log']['id'], parent_entity=es['sessions'], primitive=Count)

    compare_count = log_count_feat == 1
    compare_device_type = IdentityFeature(es['sessions']['device_type']) == 1
    and_feat = ft.Feature([compare_count, compare_device_type], primitive=And)
    feat = ft.Feature(
        es['sessions']['id'],
        parent_entity=es['customers'],
        where=and_feat,
        primitive=Count,
    )
    feature_set = FeatureSet([feat])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = calculator.run(np.array([0]))
    if isinstance(df, dd.DataFrame):
        df = df.compute().set_index('id')
        # pd.Int64Index was deprecated in pandas 1.4 and removed in 2.0;
        # pd.Index with an explicit dtype gives the same integer index on
        # every pandas version.
        df.index = pd.Index(df.index, dtype='int64')

    name = feat.get_name()
    instances = df[name]
    assert instances[0] == 1
def test_make_agg_feat_where_count_feat(es):
    """
    Feature we're creating is:
    Number of sessions for each customer where the number of logs in the
    session is greater than 1
    """
    logs_per_session = ft.Feature(
        es["log"].ww["id"],
        parent_dataframe_name="sessions",
        primitive=Count,
    )
    feat = ft.Feature(
        es["sessions"].ww["id"],
        parent_dataframe_name="customers",
        where=logs_per_session > 1,
        primitive=Count,
    )

    calculator = FeatureSetCalculator(es, time_last=None, feature_set=FeatureSet([feat]))
    result = to_pandas(
        calculator.run(np.array([0, 1])),
        index="id",
        sort_index=True,
    )

    first, second = result[feat.get_name()][0:2]
    assert first == 2
    assert second == 2
def test_calls_progress_callback(es):
    """Run every feature type and check the progress updates add up to 1.

    The calculation is performed twice: once without a cutoff time and once
    with a ``time_last`` that has no data.
    """
    # call with all feature types. make sure progress callback calls sum to 1
    identity = ft.Feature(es['customers']['age'])
    direct = ft.Feature(es['cohorts']['cohort_name'], es['customers'])
    agg = ft.Feature(es["sessions"]["id"], parent_entity=es['customers'], primitive=Count)
    # this feature is handled differently than simple features
    agg_apply = ft.Feature(
        es["log"]["datetime"],
        parent_entity=es['customers'],
        primitive=TimeSinceLast,
    )
    trans = ft.Feature(agg, primitive=CumSum)
    groupby_trans = ft.Feature(agg, primitive=CumSum, groupby=es["customers"]["cohort"])
    all_features = [identity, direct, agg, agg_apply, trans, groupby_trans]

    class MockProgressCallback:
        """Callable that accumulates every progress update it receives."""

        def __init__(self):
            self.total = 0

        def __call__(self, update):
            self.total += update

    instance_ids = [0, 1, 2]

    # First pass has no cutoff; the second uses a time_last with no data.
    for cutoff in (None, pd.Timestamp("1950")):
        calculator = FeatureSetCalculator(
            es,
            time_last=cutoff,
            feature_set=FeatureSet(all_features),
        )
        mock_progress_callback = MockProgressCallback()
        calculator.run(np.array(instance_ids), mock_progress_callback)
        assert np.isclose(mock_progress_callback.total, 1)
def test_make_trans_feat(es):
    """Apply the Hour transform to ``log.datetime`` and check instance 0 yields 10."""
    hour_feat = ft.Feature(es['log']['datetime'], primitive=Hour)

    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([hour_feat]))
    result = calculator.run(np.array([0]))

    column = result[hour_feat.get_name()]
    assert column[0] == 10
def test_percentile_with_cutoff(es):
    """Compute Percentile of ``log.value`` at a cutoff time and check instance 2 is 1.0."""
    value_feat = ft.Feature(es['log']['value'])
    percentile_feat = ft.Feature(value_feat, primitive=Percentile)

    cutoff = pd.Timestamp('2011/04/09 10:30:13')
    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([percentile_feat]),
        time_last=cutoff,
    )
    result = calculator.run(np.array([2]))

    values = result[percentile_feat.get_name()].tolist()
    assert values[0] == 1.0
def test_diff_single_value(es):
    """Compute Diff of ``stores.num_square_feet`` grouped by region for store 4 only."""
    diff = ft.Feature(
        es['stores']['num_square_feet'],
        groupby=es['stores'][u'région_id'],
        primitive=Diff,
    )

    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([diff]))
    result = calculator.run(np.array([4]))

    column = result[diff.get_name()]
    assert column[4] == 6000.0
def test_direct_from_identity(es):
    """Expose ``sessions.device_type`` on log rows as a direct feature and check two rows."""
    device_type = es['sessions']['device_type']
    direct = DirectFeature(base_feature=device_type, child_entity=es['log'])

    calculator = FeatureSetCalculator(
        es,
        feature_set=FeatureSet([direct]),
        time_last=None,
    )
    result = calculator.run([0, 5])

    observed = result[direct.get_name()].tolist()
    assert observed == [0, 1]
def test_diff_single_value_is_nan(es):
    """Compute Diff for store 5 only and check the single resulting value is missing."""
    diff = ft.Feature(
        es['stores']['num_square_feet'],
        groupby=es['stores'][u'région_id'],
        primitive=Diff,
    )

    calculator = FeatureSetCalculator(es, feature_set=FeatureSet([diff]))
    result = calculator.run(np.array([5]))

    row_count = result.shape[0]
    assert row_count == 1

    # Dropping missing values must leave nothing behind.
    non_null = result[diff.get_name()].dropna()
    assert non_null.shape[0] == 0
def test_precalculated_features(pd_es):
    """Check that precalculated feature values are used instead of being recomputed.

    The aggregation primitive used here raises whenever it is executed, so
    the calculation can only succeed if the supplied precalculated values
    are picked up.  A second run without them must raise.
    """
    error_msg = (
        "This primitive should never be used because the features are precalculated"
    )

    class ErrorPrim(AggregationPrimitive):
        """A primitive whose function raises an error."""

        name = "error_prim"
        input_types = [ColumnSchema(semantic_tags={"numeric"})]
        return_type = ColumnSchema(semantic_tags={"numeric"})

        def get_function(self, agg_type="pandas"):
            def error(s):
                raise RuntimeError(error_msg)

            return error

    value = ft.Feature(pd_es["log"].ww["value"])
    agg = ft.Feature(value, parent_dataframe_name="sessions", primitive=ErrorPrim)
    agg2 = ft.Feature(agg, parent_dataframe_name="customers", primitive=ErrorPrim)
    direct = ft.Feature(agg2, dataframe_name="sessions")

    # Set up a FeatureSet which knows which features are precalculated.
    known_features = Trie(default=set, path_constructor=RelationshipPath)
    known_node = known_features.get_node(direct.relationship_path)
    known_node.value.add(agg2.unique_name())
    feature_set = FeatureSet([direct], approximate_feature_trie=known_features)

    # Fake precalculated data.
    values = [0, 1, 2]
    parent_fm = pd.DataFrame({agg2.get_name(): values})
    known_matrices = Trie(path_constructor=RelationshipPath)
    known_matrices.get_node(direct.relationship_path).value = parent_fm

    calculator = FeatureSetCalculator(
        pd_es,
        feature_set=feature_set,
        precalculated_features=known_matrices,
    )

    instance_ids = [0, 2, 3, 5]
    fm = calculator.run(np.array(instance_ids))

    expected = [values[0], values[0], values[1], values[2]]
    assert list(fm[direct.get_name()]) == expected

    # Calculating without precalculated features should error.
    with pytest.raises(RuntimeError, match=error_msg):
        plain_calculator = FeatureSetCalculator(pd_es, feature_set=FeatureSet([direct]))
        plain_calculator.run(instance_ids)
def test_direct_squared(es):
    """Check that ``feature * feature`` equals the square of the identity feature.

    Both the identity feature on ``log.value`` and its product with itself
    are calculated for instances 0, 1 and 2; every row's second column must
    equal the first column squared.
    """
    feature = IdentityFeature(es['log']['value'])
    squared = feature * feature
    feature_set = FeatureSet([feature, squared])
    calculator = FeatureSetCalculator(es, time_last=None, feature_set=feature_set)
    df = to_pandas(calculator.run(np.array([0, 1, 2])))
    for _, row in df.iterrows():
        # Use .iloc for positional access: integer keys on a label-indexed
        # Series (row[0]) are deprecated as positions since pandas 2.1.
        base_value = row.iloc[0]
        assert (base_value * base_value) == row.iloc[1]
def test_make_agg_feat_of_grandchild_entity(es):
    """Count ``log.id`` directly onto customers and check customer 0 has 10."""
    logs_per_customer = ft.Feature(
        es['log']['id'],
        parent_entity=es['customers'],
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_customer]),
    )
    result = calculator.run(np.array([0]))

    column = result[logs_per_customer.get_name()]
    assert column[0] == 10
def test_make_identity(es):
    """Calculate an identity feature on ``log.datetime`` and check the value for instance 0."""
    datetime_feat = IdentityFeature(es['log']['datetime'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([datetime_feat]),
    )
    result = calculator.run(np.array([0]))

    expected = datetime(2011, 4, 9, 10, 30, 0)
    column = result[datetime_feat.get_name()]
    assert column[0] == expected
def test_make_dfeat(es):
    """Expose ``customers.age`` on sessions as a direct feature and check session 0 is 33."""
    age_on_session = DirectFeature(es['customers']['age'], child_entity=es['sessions'])

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([age_on_session]),
    )
    result = calculator.run(np.array([0]))

    column = result[age_on_session.get_name()]
    assert column[0] == 33
def test_make_agg_feat_of_identity_index_variable(es):
    """Count ``log.id`` onto sessions and check session 0 has 5."""
    logs_per_session = ft.Feature(
        es['log']['id'],
        parent_entity=es['sessions'],
        primitive=Count,
    )

    calculator = FeatureSetCalculator(
        es,
        time_last=None,
        feature_set=FeatureSet([logs_per_session]),
    )
    result = calculator.run(np.array([0]))

    column = result[logs_per_session.get_name()]
    assert column[0] == 5